In [1]:
#source('functions.R')
source('utils_preprocess.R')

MYLIBRARIES<-c("outliers",
               "corrplot",
               "MASS",
               "formattable",
               "stats",
               "caret",
               "PerformanceAnalytics",
               "smotefamily",
               "tidyverse",
               #"ElemStatLearn",
               #"imputation",
               "naivebayes",
               'dplyr',
               "stringr",
               "partykit",
               "C50",
               "randomForest",
               "keras",
               "kohonen",
  "cluster",
  "ggplot2",
  "formattable",
  "factoextra",
  "gridExtra",
  "igraph","unix"
               #"h2o"
               #'ggplot2'
               #"KODAMA"
              )
#install.packages("KODAMA")
#install.packages('lattice')
#install.packages('caret')
#install.packages('e1071')
install.packages('pacman')
#install.packages('ggplot')
# library(dplyr) # for transforming to tibble object
# library(caret)
#library(e1071)

# ************************************************
# This is where R starts execution

# Automatically release memory
gc()

# Tries to clear plots and other graphics in RStudio output
if(!is.null(dev.list())) dev.off()
graphics.off()

# This clears all warning messages
#assign("last.warning", NULL, envir = baseenv())

# clears the RStudio console area
cat("\014")

# If library not already on your computer this will download and
# install the library. Each library is activated.
library(pacman)
pacman::p_load(char=MYLIBRARIES,install=TRUE,character.only=TRUE)
library(dplyr) # for transforming to tibble object
library(caret)
library(unix)
Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

A matrix: 2 × 6 of type dbl
used(Mb)gc trigger(Mb)max used(Mb)
Ncells 64920034.7134063271.6134063271.6
Vcells1277278 9.8838860864.0498012038.0
null device: 1


*conda install -c r r-tidyverse*¶

Data Understanding¶

Load a passenger satisfaction dataset¶

Load 2 version of a Airline passenger satisfaction dataset¶

In this project we will be using 2 versions of the same dataset to analyse the passenger's satisfaction to the airline. We will first combine both versions.

In [2]:
dataset1<-NreadDataset('./dataset/satisfaction.csv')
dataset2<-NreadDataset('./dataset/satisfaction_2015.csv')
[1] "CSV dataset ./dataset/satisfaction.csv has been read. Records= 129880"
[1] "CSV dataset ./dataset/satisfaction_2015.csv has been read. Records= 129880"

Check the similar field in both dataset¶

Before merging both datasets together, It is necessary for both dataset to have same number of columns with the same name and attribute.

In [3]:
# remove ID field which index is at 1 (-1 will remove index at position 1 )
dataset1 = dataset1[, -which(names(dataset1) %in% c('id'))]
dataset2 = dataset2[, -which(names(dataset2) %in% c('id'))]

print(paste("Names of Similar fields : ",names(dataset1[,(names(dataset1) %in% names(dataset2))])))
 [1] "Names of Similar fields :  satisfactionv2"                
 [2] "Names of Similar fields :  Gender"                        
 [3] "Names of Similar fields :  CustomerType"                  
 [4] "Names of Similar fields :  Age"                           
 [5] "Names of Similar fields :  TypeofTravel"                  
 [6] "Names of Similar fields :  Class"                         
 [7] "Names of Similar fields :  FlightDistance"                
 [8] "Names of Similar fields :  Seatcomfort"                   
 [9] "Names of Similar fields :  DepartureArrivaltimeconvenient"
[10] "Names of Similar fields :  Foodanddrink"                  
[11] "Names of Similar fields :  Gatelocation"                  
[12] "Names of Similar fields :  Inflightwifiservice"           
[13] "Names of Similar fields :  Inflightentertainment"         
[14] "Names of Similar fields :  EaseofOnlinebooking"           
[15] "Names of Similar fields :  Onboardservice"                
[16] "Names of Similar fields :  Legroomservice"                
[17] "Names of Similar fields :  Baggagehandling"               
[18] "Names of Similar fields :  Checkinservice"                
[19] "Names of Similar fields :  Cleanliness"                   
[20] "Names of Similar fields :  Onlineboarding"                
[21] "Names of Similar fields :  DepartureDelayinMinutes"       
[22] "Names of Similar fields :  ArrivalDelayinMinutes"         

Check different in both Dataframe¶

In [4]:
# Field that dataset 1 have but dataset 2 doesnt have
diff1<-names(dataset1)[-which(names(dataset1) %in% names(dataset2))]

# Field that dataset 2 have but dataset 1 doesnt have 
diff2<-names(dataset2)[-which(names(dataset2) %in% names(dataset1))]
differ <-c(diff1,diff2)
print(paste("Name of different fields : ",differ))
[1] "Name of different fields :  Onlinesupport"  
[2] "Name of different fields :  Inflightservice"

We can see that both versions of the dataset contain 22 similar fields with the same names and attributes, wheres only one field is different between the two. These are the *('Onlinesupport')* and *('Inflightservice')*.

Field a different with NA and combine both dataset¶

Both datasets needs to have the same number of columns to be able to be combined. So we decided to fill NA values to columns with none exiting values.

In [5]:
dataset1[setdiff(names(dataset2), names(dataset1))] <- NA
dataset2[setdiff(names(dataset1), names(dataset2))] <- NA
# combine both tables
combined<-rbind(dataset1, dataset2)
# Visualize combined dataset
print(paste("Number of columns after combined: ", ncol(combined)))
print(paste("Number of rows after combined: ",nrow(combined)))
head(combined)
[1] "Number of columns after combined:  24"
[1] "Number of rows after combined:  259760"
A data.frame: 6 × 24
satisfactionv2GenderCustomerTypeAgeTypeofTravelClassFlightDistanceSeatcomfortDepartureArrivaltimeconvenientFoodanddrink⋯EaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboardingDepartureDelayinMinutesArrivalDelayinMinutesInflightservice
<chr><chr><chr><int><chr><chr><int><int><int><int>⋯<int><int><int><int><int><int><int><int><int><int>
1satisfiedFemaleLoyal Customer65Personal TravelEco 265000⋯3303532 0 0NA
2satisfiedMale Loyal Customer47Personal TravelBusiness2464000⋯3444232310305NA
3satisfiedFemaleLoyal Customer15Personal TravelEco 2138000⋯2334442 0 0NA
4satisfiedFemaleLoyal Customer60Personal TravelEco 623000⋯1101413 0 0NA
5satisfiedFemaleLoyal Customer70Personal TravelEco 354000⋯2202425 0 0NA
6satisfiedMale Loyal Customer30Personal TravelEco 1894000⋯2545542 0 0NA
In [ ]:

The final mergred dataset contains 24 columns and 259760 rows.

Data cleaning¶

Next step, we first visualize a summary of combined dataset to see the its details and statistic

In [6]:
summary(combined)
 satisfactionv2        Gender          CustomerType            Age       
 Length:259760      Length:259760      Length:259760      Min.   : 7.00  
 Class :character   Class :character   Class :character   1st Qu.:27.00  
 Mode  :character   Mode  :character   Mode  :character   Median :40.00  
                                                          Mean   :39.43  
                                                          3rd Qu.:51.00  
                                                          Max.   :85.00  
                                                                         
 TypeofTravel          Class           FlightDistance  Seatcomfort  
 Length:259760      Length:259760      Min.   :  31   Min.   :0.00  
 Class :character   Class :character   1st Qu.: 606   1st Qu.:2.00  
 Mode  :character   Mode  :character   Median :1510   Median :3.00  
                                       Mean   :1586   Mean   :3.14  
                                       3rd Qu.:2297   3rd Qu.:4.00  
                                       Max.   :6951   Max.   :5.00  
                                                                    
 DepartureArrivaltimeconvenient  Foodanddrink    Gatelocation  
 Min.   :0.000                  Min.   :0.000   Min.   :0.000  
 1st Qu.:2.000                  1st Qu.:2.000   1st Qu.:2.000  
 Median :3.000                  Median :3.000   Median :3.000  
 Mean   :3.024                  Mean   :3.028   Mean   :2.984  
 3rd Qu.:4.000                  3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :5.000                  Max.   :5.000   Max.   :5.000  
                                                               
 Inflightwifiservice Inflightentertainment Onlinesupport    EaseofOnlinebooking
 Min.   :0.000       Min.   :0.000         Min.   :0.00     Min.   :0.000      
 1st Qu.:2.000       1st Qu.:2.000         1st Qu.:3.00     1st Qu.:2.000      
 Median :3.000       Median :4.000         Median :4.00     Median :3.000      
 Mean   :2.989       Mean   :3.371         Mean   :3.52     Mean   :3.114      
 3rd Qu.:4.000       3rd Qu.:4.000         3rd Qu.:5.00     3rd Qu.:4.000      
 Max.   :5.000       Max.   :5.000         Max.   :5.00     Max.   :5.000      
                                           NA's   :129880                      
 Onboardservice  Legroomservice  Baggagehandling Checkinservice 
 Min.   :0.000   Min.   :0.000   Min.   :1.000   Min.   :0.000  
 1st Qu.:3.000   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000  
 Median :4.000   Median :4.000   Median :4.000   Median :3.000  
 Mean   :3.424   Mean   :3.418   Mean   :3.664   Mean   :3.324  
 3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000  
 Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
                                                                
  Cleanliness    Onlineboarding  DepartureDelayinMinutes ArrivalDelayinMinutes
 Min.   :0.000   Min.   :0.000   Min.   :   0.00         Min.   :   0.00      
 1st Qu.:3.000   1st Qu.:2.000   1st Qu.:   0.00         1st Qu.:   0.00      
 Median :4.000   Median :3.000   Median :   0.00         Median :   0.00      
 Mean   :3.496   Mean   :3.303   Mean   :  14.71         Mean   :  15.09      
 3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:  12.00         3rd Qu.:  13.00      
 Max.   :5.000   Max.   :5.000   Max.   :1592.00         Max.   :1584.00      
                                                         NA's   :786          
 Inflightservice 
 Min.   :0.00    
 1st Qu.:3.00    
 Median :4.00    
 Mean   :3.64    
 3rd Qu.:5.00    
 Max.   :5.00    
 NA's   :129880  

After look in through a details of dataset we can see that three columns have a missing values, that are *('ArrivalDelayinMinutes'), ('Onlinesupport') and ('Inflight'). The missing values in ('Onlinesupport')* and *('Inflightservice')* are value that was filled because these two fields were a field that each dataset uniquely has. The amount of missing value in both of these field are 129880 rows which is 50 % of totak number of rows. We decide not filling a missing value by imputation process nor taking mean or meadian of each columns and remove both of these field. Since the number of missing value is 50 % of dataset the value fill might be inaccurate. But for field *('ArrivalDelayMinutes')*, the missing value were filling by using median value in that field

In [7]:
#remove colums which field which have more that 50 % is NA value. Inflightservice and Onlinesupport

rm_combined<-combined[, -which(names(combined) %in% c('Inflightservice','Onlinesupport'))]

# Filling a Null value with median value of that field which is 0
# the reseason of choosing median over mean of the field to prevent a chance of creating new outlier
rm_combined$ArrivalDelayinMinutes[is.na(rm_combined$ArrivalDelayinMinutes)] <- 0
In [8]:
summary(rm_combined)
 satisfactionv2        Gender          CustomerType            Age       
 Length:259760      Length:259760      Length:259760      Min.   : 7.00  
 Class :character   Class :character   Class :character   1st Qu.:27.00  
 Mode  :character   Mode  :character   Mode  :character   Median :40.00  
                                                          Mean   :39.43  
                                                          3rd Qu.:51.00  
                                                          Max.   :85.00  
 TypeofTravel          Class           FlightDistance  Seatcomfort  
 Length:259760      Length:259760      Min.   :  31   Min.   :0.00  
 Class :character   Class :character   1st Qu.: 606   1st Qu.:2.00  
 Mode  :character   Mode  :character   Median :1510   Median :3.00  
                                       Mean   :1586   Mean   :3.14  
                                       3rd Qu.:2297   3rd Qu.:4.00  
                                       Max.   :6951   Max.   :5.00  
 DepartureArrivaltimeconvenient  Foodanddrink    Gatelocation  
 Min.   :0.000                  Min.   :0.000   Min.   :0.000  
 1st Qu.:2.000                  1st Qu.:2.000   1st Qu.:2.000  
 Median :3.000                  Median :3.000   Median :3.000  
 Mean   :3.024                  Mean   :3.028   Mean   :2.984  
 3rd Qu.:4.000                  3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :5.000                  Max.   :5.000   Max.   :5.000  
 Inflightwifiservice Inflightentertainment EaseofOnlinebooking Onboardservice 
 Min.   :0.000       Min.   :0.000         Min.   :0.000       Min.   :0.000  
 1st Qu.:2.000       1st Qu.:2.000         1st Qu.:2.000       1st Qu.:3.000  
 Median :3.000       Median :4.000         Median :3.000       Median :4.000  
 Mean   :2.989       Mean   :3.371         Mean   :3.114       Mean   :3.424  
 3rd Qu.:4.000       3rd Qu.:4.000         3rd Qu.:4.000       3rd Qu.:4.000  
 Max.   :5.000       Max.   :5.000         Max.   :5.000       Max.   :5.000  
 Legroomservice  Baggagehandling Checkinservice   Cleanliness   
 Min.   :0.000   Min.   :1.000   Min.   :0.000   Min.   :0.000  
 1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
 Median :4.000   Median :4.000   Median :3.000   Median :4.000  
 Mean   :3.418   Mean   :3.664   Mean   :3.324   Mean   :3.496  
 3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
 Onlineboarding  DepartureDelayinMinutes ArrivalDelayinMinutes
 Min.   :0.000   Min.   :   0.00         Min.   :   0.00      
 1st Qu.:2.000   1st Qu.:   0.00         1st Qu.:   0.00      
 Median :3.000   Median :   0.00         Median :   0.00      
 Mean   :3.303   Mean   :  14.71         Mean   :  15.05      
 3rd Qu.:4.000   3rd Qu.:  12.00         3rd Qu.:  13.00      
 Max.   :5.000   Max.   :1592.00         Max.   :1584.00      
In [9]:
dataset<-rm_combined

Display Summary of dataset¶

Checking a type of each fields (numeric or symbolic) and also checking for missing value

In [10]:
# displat summary of dataset 
summary(dataset)
 satisfactionv2        Gender          CustomerType            Age       
 Length:259760      Length:259760      Length:259760      Min.   : 7.00  
 Class :character   Class :character   Class :character   1st Qu.:27.00  
 Mode  :character   Mode  :character   Mode  :character   Median :40.00  
                                                          Mean   :39.43  
                                                          3rd Qu.:51.00  
                                                          Max.   :85.00  
 TypeofTravel          Class           FlightDistance  Seatcomfort  
 Length:259760      Length:259760      Min.   :  31   Min.   :0.00  
 Class :character   Class :character   1st Qu.: 606   1st Qu.:2.00  
 Mode  :character   Mode  :character   Median :1510   Median :3.00  
                                       Mean   :1586   Mean   :3.14  
                                       3rd Qu.:2297   3rd Qu.:4.00  
                                       Max.   :6951   Max.   :5.00  
 DepartureArrivaltimeconvenient  Foodanddrink    Gatelocation  
 Min.   :0.000                  Min.   :0.000   Min.   :0.000  
 1st Qu.:2.000                  1st Qu.:2.000   1st Qu.:2.000  
 Median :3.000                  Median :3.000   Median :3.000  
 Mean   :3.024                  Mean   :3.028   Mean   :2.984  
 3rd Qu.:4.000                  3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :5.000                  Max.   :5.000   Max.   :5.000  
 Inflightwifiservice Inflightentertainment EaseofOnlinebooking Onboardservice 
 Min.   :0.000       Min.   :0.000         Min.   :0.000       Min.   :0.000  
 1st Qu.:2.000       1st Qu.:2.000         1st Qu.:2.000       1st Qu.:3.000  
 Median :3.000       Median :4.000         Median :3.000       Median :4.000  
 Mean   :2.989       Mean   :3.371         Mean   :3.114       Mean   :3.424  
 3rd Qu.:4.000       3rd Qu.:4.000         3rd Qu.:4.000       3rd Qu.:4.000  
 Max.   :5.000       Max.   :5.000         Max.   :5.000       Max.   :5.000  
 Legroomservice  Baggagehandling Checkinservice   Cleanliness   
 Min.   :0.000   Min.   :1.000   Min.   :0.000   Min.   :0.000  
 1st Qu.:2.000   1st Qu.:3.000   1st Qu.:3.000   1st Qu.:3.000  
 Median :4.000   Median :4.000   Median :3.000   Median :4.000  
 Mean   :3.418   Mean   :3.664   Mean   :3.324   Mean   :3.496  
 3rd Qu.:5.000   3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000  
 Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
 Onlineboarding  DepartureDelayinMinutes ArrivalDelayinMinutes
 Min.   :0.000   Min.   :   0.00         Min.   :   0.00      
 1st Qu.:2.000   1st Qu.:   0.00         1st Qu.:   0.00      
 Median :3.000   Median :   0.00         Median :   0.00      
 Mean   :3.303   Mean   :  14.71         Mean   :  15.05      
 3rd Qu.:4.000   3rd Qu.:  12.00         3rd Qu.:  13.00      
 Max.   :5.000   Max.   :1592.00         Max.   :1584.00      
In [11]:
# Visualize a summary with different function
prettydataset<-NPREPROCESSING_prettyDataset(dataset)
prettydataset
In [12]:
# assigning a field type to each fields in Dataset

TYPE_DISCRETE     <- "DISCRETE"           # field is discrete (numeric)
TYPE_ORDINAL      <- "ORDINAL"            # field is continuous numeric
TYPE_SYMBOLIC     <- "SYMBOLIC"           # field is a string
TYPE_NUMERIC      <- "NUMERIC"            # field is initially a numeric
TYPE_IGNORE       <- "IGNORE"             # field is not encoded

# This function will differentiate between Numeric and Symbolic
field_types<-NPREPROCESSING_initialFieldType(dataset)
In [13]:
# Total number of fields
print(paste('Total columns ',ncol(dataset)))
# index of numeric fields
numeric_index<-field_types=="NUMERIC"
# index of symbolic fields
symbolic_index<-field_types=="SYMBOLIC"
# names of colums that are numeric
numeric_fields<-names(dataset[,numeric_index])
# names of colums that are symbolic 
symbolic_fields<-names(dataset[,symbolic_index])
print("**************************************************")
print(paste("Numeric Fields : ", length(numeric_fields)))
print(numeric_fields)
print("**************************************************")
print(paste("Symbolic Fields :", length(symbolic_fields)))
print(symbolic_fields)
print("**************************************************")

###############################################################################################################

print(paste("Type of attribute in Numeric Fields"))

# Statisfaction Scale Fields
sat_cols<- c("Inflightwifiservice" ,"DepartureArrivaltimeconvenient",
            "EaseofOnlinebooking","Gatelocation","Foodanddrink",
            "Onlineboarding","Seatcomfort","Inflightentertainment",
            "Onboardservice","Legroomservice","Baggagehandling",
            "Checkinservice","Cleanliness")
print(paste("Satisfy scale Fields : ",length(sat_cols)))

print(sat_cols)
print("**************************************************")
# Other numeric fields that is not satisfaction scale fields
pure_numerical_val<-c('Age', 'FlightDistance', 'DepartureDelayinMinutes','ArrivalDelayinMinutes')
time_field<-c('DepartureDelayinMinutes','ArrivalDelayinMinutes')
distance_field<-c('FlightDistance')
print(paste("others numercial Fields : ",length(pure_numerical_val)))

print(pure_numerical_val)
[1] "Total columns  22"
[1] "**************************************************"
[1] "Numeric Fields :  17"
 [1] "Age"                            "FlightDistance"                
 [3] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
 [5] "Foodanddrink"                   "Gatelocation"                  
 [7] "Inflightwifiservice"            "Inflightentertainment"         
 [9] "EaseofOnlinebooking"            "Onboardservice"                
[11] "Legroomservice"                 "Baggagehandling"               
[13] "Checkinservice"                 "Cleanliness"                   
[15] "Onlineboarding"                 "DepartureDelayinMinutes"       
[17] "ArrivalDelayinMinutes"         
[1] "**************************************************"
[1] "Symbolic Fields : 5"
[1] "satisfactionv2" "Gender"         "CustomerType"   "TypeofTravel"  
[5] "Class"         
[1] "**************************************************"
[1] "Type of attribute in Numeric Fields"
[1] "Satisfy scale Fields :  13"
 [1] "Inflightwifiservice"            "DepartureArrivaltimeconvenient"
 [3] "EaseofOnlinebooking"            "Gatelocation"                  
 [5] "Foodanddrink"                   "Onlineboarding"                
 [7] "Seatcomfort"                    "Inflightentertainment"         
 [9] "Onboardservice"                 "Legroomservice"                
[11] "Baggagehandling"                "Checkinservice"                
[13] "Cleanliness"                   
[1] "**************************************************"
[1] "others numercial Fields :  4"
[1] "Age"                     "FlightDistance"         
[3] "DepartureDelayinMinutes" "ArrivalDelayinMinutes"  

This summary show that 5 fields in a dataset are symbolic and remaining are numeric¶

Symbolic fields are :-¶

*Satifactionv2 , Gender, Customer , TypeofTravel and Class*

Numeric fields are :-¶

*Age, FlightDistance, Seatcomfort, DepartureArrivaltimeconvenient, Foodanddrink, Gatelocation, Inflightwifiservice, Inflightentertainment, EaseofOnlinebooking, Onboardservice, Legroomservice, Baggagehandling, Checkinservice, Cleanliness, Onlineboarding, DepartureDelayinMinutes, and ArrivalDelayinMinutes*

Where 13 numeric fields are the columns containing satifaction level provided from a customer from a scaling 0 to 5 and other 4 numeric field are age of customer , travel distance of a flight and last two fields are time delay in minute

Data Analysis¶

Visualizing Numeric fields¶

Visualizing a correlation of numeric fields¶

In [14]:
# Visualizing a correlation of numeric fields
library(corrplot)
correlations <- cor(dataset[,numeric_fields])
corrplot(cor(dataset[,numeric_fields]), method="circle")

Visualzing a Box Plot of a fields containing statisfaction level¶

We can see that most of the fields containing statisfaction level have a good balance normal distribution but some still have a some outliers and also positive and negative skewing such in *("Onboard service"), ("Baggehandling"), ("Checkinservice") and ("Cleanliness")*

Also, The satisfaction scale is rating between 0-5 but in Baggagehandling the range of value dataset contains in that field is 1-5

In [15]:
# Box plot
par(mfrow=c(2,1))
options(repr.plot.width = 15, repr.plot.height = 10)
# satisfaction data

satis_data<-dataset[,sat_cols]

boxplot(satis_data, las = 2)

Visualizing a Historgram in in other numeric fields'¶

For visualizing other numeric fields, using Histogram are suitable for visualizing a distribution of data in those field.

In [16]:
# Box plot
# satisfaction data

other_num<-dataset[,pure_numerical_val]

par(mfrow=c(2,2))
options(repr.plot.width = 10, repr.plot.height = 10)
for (name in pure_numerical_val){
    
    hist((dataset[,name]), main=name,breaks=10,prob=TRUE)
    lines(density(dataset[,name]),
      lwd = 2,
      col = "chocolate3")
}

Histrogram show that in columns *('Age')* has a perfect normal distribution where in *('Flightdistance')* the distribution of a data is skew to the left and have small number which means the frequency of a data passenger short flight distance is more than longer flight distance. After looking through distibution of these columns, *("Age")* can be treat as ordinal value and use Linear normalization to normalize a value in a range of 0.0 to 1.0 in the processing part.

For others 2 columns, the DepatureDelayinMinutes and ArrivalDelayinMinutes are clearly have distribution skew to the left and large value of outlier in a columns. Both of these need to be treat as a Discrete value and will be preprocess using one hot encoding.

Visualize Symbolic Fields¶

Use BarChart to display a distribution of each categories in each fields¶

Barchar is suitable for visualizing distribution of a symbolic fields. For example we can cleary visualize the number of a satisfy passenger and unsatisfy

In [17]:
par(mfrow=c(1,1),pty = "m")
options(repr.plot.width = 10, repr.plot.height = 5)
for (name in symbolic_fields){
    counts <- round(table(dataset[,name])/ length(dataset[,name])*100,2)
    if (name == 'satisfactionv2'){
        name = "satisfaction"
    }
    xx<-barplot(counts, main=name,space=0.5) 
    text(x = xx, y = counts, label = paste(counts," %") , pos = 1, cex = 1.5, col = "black")  
}

Visualizing Relationship between a symbolic fields¶

Here we are using a barchart to visualize a relationship of satiscation fields on other symbolic fields.

In [18]:
library("ggplot2")
sat_class = dataset[,c('satisfactionv2','Class')]
table(sat_class)
results <- data.frame(table(sat_class))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Class)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Class \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
                         Class
satisfactionv2            Business   Eco Eco Plus
  neutral or dissatisfied    37059 82702    12484
  satisfied                  87261 33916     6338
In [19]:
library("ggplot2")
sat_gender = dataset[,c('satisfactionv2','Gender')]
table(sat_gender)
results <- data.frame(table(sat_gender))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Gender)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on gender \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
                         Gender
satisfactionv2            Female  Male
  neutral or dissatisfied  60601 71644
  satisfied                71197 56318
In [20]:
sat_type = dataset[,c('satisfactionv2','CustomerType')]
table(sat_type)
results <- data.frame(table(sat_type))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = CustomerType)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Customer Type \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
                         CustomerType
satisfactionv2            disloyal Customer Loyal Customer
  neutral or dissatisfied             36160          96085
  satisfied                           11400         116115
In [21]:
sat_type = dataset[,c('satisfactionv2','TypeofTravel')]
table(sat_type)
results <- data.frame(table(sat_type))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill =TypeofTravel)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Type of Travel \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
                         TypeofTravel
satisfactionv2            Business travel Personal Travel
  neutral or dissatisfied           74674           57571
  satisfied                        104712           22803

Visualizing Relationship between a Cusomter Satisfaction fields with fields containing Satisfaction scale¶

Here again, we are using a barchart to visualize a relationship of satiscation fields with all a fields containing a rating of customer satisfaction in each categories.

In [22]:
par(mfrow=c(2,4),pty = "m")
options(repr.plot.width = 10,epr.plot.height = 5)

temp_df<-cbind(dataset[,c('satisfactionv2','Inflightwifiservice')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill =Inflightwifiservice)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Inflightwifiservice\n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [23]:
temp_df<-cbind(dataset[,c('satisfactionv2','EaseofOnlinebooking')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill =EaseofOnlinebooking)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on EaseofOnlinebooking \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [24]:
temp_df<-cbind(dataset[,c('satisfactionv2','Foodanddrink')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Foodanddrink)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Foodanddrink \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [25]:
temp_df<-cbind(dataset[,c('satisfactionv2','Seatcomfort')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Seatcomfort)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Seatcomfort \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [26]:
temp_df<-cbind(dataset[,c('satisfactionv2','Onboardservice')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Onboardservice)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Onboardservice \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [27]:
temp_df<-cbind(dataset[,c('satisfactionv2','Baggagehandling')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Baggagehandling)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Baggagehandling \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [28]:
temp_df<-cbind(dataset[,c('satisfactionv2','Cleanliness')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Cleanliness)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Clealiness \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [29]:
temp_df<-cbind(dataset[,c('satisfactionv2','DepartureArrivaltimeconvenient')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = DepartureArrivaltimeconvenient)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on DepartureArrivaltimeconvenient \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [30]:
temp_df<-cbind(dataset[,c('satisfactionv2','Gatelocation')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Gatelocation)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Gatelocation \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [31]:
temp_df<-cbind(dataset[,c('satisfactionv2','Onlineboarding')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Onlineboarding)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Onlineboarding \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [32]:
temp_df<-cbind(dataset[,c('satisfactionv2','Inflightentertainment')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill =Inflightentertainment)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Inflightentertainment \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [33]:
temp_df<-cbind(dataset[,c('satisfactionv2','Legroomservice')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Legroomservice)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Legroomservice \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))
In [34]:
temp_df<-cbind(dataset[,c('satisfactionv2','Checkinservice')])
results <- data.frame(table(temp_df))
ggplot(data = results, aes(x = satisfactionv2, y = Freq, fill = Checkinservice)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Checkinservice \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))

Visualizing Relationship between a Cusomter Satisfaction fields with maximum occurance of a rating from all satisfation scale for each passenger¶

Looking at the a plot of relationship individually might be too difficult to understand the relationship and satisfaction rating in each categories to passenger satisfation. Instead, we count the occurance of customers' rating and take the maximum rating occur for each customer and use barchart to visualize a relationship with final satisfaction

In [35]:
sat_data=dataset[,sat_cols]

# Calculate a maximum occurance from all satisfation scale for each customer  
max_occurance<-apply(sat_data,1,function(x) names(which.max(table(x))))
In [36]:
# create dataframe of max occurance scale
max_occur<-data.frame( max_occur = max_occurance)

sat=dataset[,c("satisfactionv2")]
# combine dataframe of max occurance and final statisfaction
max_sat<-cbind(max_occur,sat)
In [37]:
results <- data.frame(table(max_sat))
ggplot(data = results, aes(x = sat, y = Freq, fill =max_occur)) +
geom_bar(stat = "identity", position = position_dodge(), alpha = 0.75)  +

  geom_text(aes(label = Freq), fontface = "bold", vjust = 1.5,
             position = position_dodge(.9), size = 4) +
  labs(x = "\n satisfaction", y = "Frequency\n", title = "\n Customer satisfaction based on Maximum Occurance of scale \n") +
  theme(plot.title = element_text(hjust = 0.5), 
        axis.title.x = element_text(face="bold", colour="red", size = 12),
        axis.title.y = element_text(face="bold", colour="red", size = 12),
        legend.title = element_text(face="bold", size = 10))

From this visualization we can see that statiscally passengers that have neutral or disstasified to a service of an airlines mostly have a maximum occurance of rating from 1 to 3. While on the other hand for a passenger with who satisfied a service has maximum occurance from 4 to 5.

Data preparation or Data preprocessing¶

Data preprocessing is a crucial process before feeding a data into a models

In [38]:
dataset
A data.frame: 259760 × 22
satisfactionv2GenderCustomerTypeAgeTypeofTravelClassFlightDistanceSeatcomfortDepartureArrivaltimeconvenientFoodanddrink⋯InflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboardingDepartureDelayinMinutesArrivalDelayinMinutes
<chr><chr><chr><int><chr><chr><int><int><int><int>⋯<int><int><int><int><int><int><int><int><int><dbl>
satisfiedFemaleLoyal Customer65Personal TravelEco 265000⋯43303532 0 0
satisfiedMale Loyal Customer47Personal TravelBusiness2464000⋯23444232310305
satisfiedFemaleLoyal Customer15Personal TravelEco 2138000⋯02334442 0 0
satisfiedFemaleLoyal Customer60Personal TravelEco 623000⋯41101413 0 0
satisfiedFemaleLoyal Customer70Personal TravelEco 354000⋯32202425 0 0
satisfiedMale Loyal Customer30Personal TravelEco 1894000⋯02545542 0 0
satisfiedFemaleLoyal Customer66Personal TravelEco 227000⋯55505553 17 15
satisfiedMale Loyal Customer10Personal TravelEco 1812000⋯02334542 0 0
satisfiedFemaleLoyal Customer56Personal TravelBusiness 73000⋯34401544 0 0
satisfiedMale Loyal Customer22Personal TravelEco 1556000⋯02245342 30 26
satisfiedFemaleLoyal Customer58Personal TravelEco 104000⋯33301235 47 48
satisfiedFemaleLoyal Customer34Personal TravelEco 3633000⋯02325252 0 0
satisfiedMale Loyal Customer62Personal TravelEco 1695000⋯05132245 0 0
satisfiedMale Loyal Customer35Personal TravelEco 1766010⋯04352324 0 0
satisfiedFemaleLoyal Customer47Personal TravelEco 84010⋯25505252 40 48
satisfiedMale Loyal Customer60Personal TravelEco 1373010⋯01341421 0 0
satisfiedFemaleLoyal Customer13Personal TravelEco 3693010⋯04441314 5 0
satisfiedFemaleLoyal Customer52Personal TravelBusiness2610010⋯21101213 0 0
satisfiedFemaleLoyal Customer55Personal TravelEco 2554010⋯12112131 0 0
satisfiedFemaleLoyal Customer28Personal TravelEco 3095010⋯03252323 0 0
satisfiedFemaleLoyal Customer 9Personal TravelEco 3305010⋯03111333 0 0
satisfiedFemaleLoyal Customer10Personal TravelEco 2090010⋯01351421 0 0
satisfiedFemaleLoyal Customer25Personal TravelEco 2122010⋯02413132 0 0
satisfiedMale Loyal Customer53Personal TravelBusiness1099010⋯31101311 0 0
satisfiedFemaleLoyal Customer16Personal TravelEco Plus1747010⋯02332432 0 0
satisfiedMale Loyal Customer30Personal TravelEco 1817010⋯04213324 0 0
satisfiedMale Loyal Customer64Personal TravelEco 1707010⋯05442325 0 0
satisfiedFemaleLoyal Customer42Personal TravelEco 470010⋯23303134 2 23
satisfiedMale Loyal Customer 9Personal TravelEco 972010⋯04433134 0 0
satisfiedFemaleLoyal Customer35Personal TravelEco 3695010⋯42234434 0 0
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋱⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
satisfiedMale Loyal Customer42Business travelBusiness 148452⋯55555555 0 1
satisfiedFemaleLoyal Customer46Business travelBusiness 143253⋯555553532520
satisfiedMale Loyal Customer20Business travelEco Plus 620515⋯53143355 0 0
satisfiedFemaleLoyal Customer50Business travelBusiness2399233⋯53555522 0 0
satisfiedFemaleLoyal Customer43Business travelBusiness 134422⋯53555452 0 0
satisfiedMale Loyal Customer35Business travelBusiness3640542⋯545554443147
satisfiedMale Loyal Customer53Business travelBusiness 130424⋯52555535 0 0
satisfiedFemaleLoyal Customer45Business travelBusiness 129544⋯53555454 0 0
satisfiedMale Loyal Customer53Business travelBusiness3626534⋯53555454 0 0
satisfiedMale Loyal Customer47Business travelBusiness2848335⋯535553545482
satisfiedMale Loyal Customer43Business travelBusiness2244455⋯55555535 0 0
satisfiedMale Loyal Customer18Business travelEco 2402525⋯5231325515 0
satisfiedFemaleLoyal Customer31Business travelBusiness2062535⋯55555455 035
satisfiedFemaleLoyal Customer52Business travelBusiness 114515⋯54555223 0 0
satisfiedMale Loyal Customer50Business travelBusiness1985444⋯54555354 0 0
satisfiedMale Loyal Customer47Business travelBusiness1901131⋯53555433 0 0
satisfiedMale Loyal Customer39Business travelBusiness1970513⋯515553538282
satisfiedMale Loyal Customer16Business travelEco 588515⋯51242555 0 0
satisfiedFemaleLoyal Customer48Business travelBusiness3108553⋯555555354839
satisfiedMale Loyal Customer64Business travelBusiness2062352⋯53555443 0 0
satisfiedFemaleLoyal Customer56Business travelBusiness3088455⋯525555551425
satisfiedMale Loyal Customer30Business travelBusiness2172545⋯54542555 0 0
satisfiedMale Loyal Customer39Business travelBusiness3264435⋯53555354 0 0
satisfiedFemaleLoyal Customer55Business travelBusiness 90534⋯53555354 0 0
satisfiedMale Loyal Customer12Business travelEco 1476525⋯52243455 0 0
satisfiedFemaleLoyal Customer35Business travelBusiness2592454⋯55555542 0 0
satisfiedFemaleLoyal Customer11Business travelEco 610515⋯515412558265
satisfiedMale Loyal Customer46Business travelBusiness 86515⋯51131525 0 0
satisfiedFemaleLoyal Customer35Business travelBusiness 83515⋯51442555 0 0
satisfiedFemaleLoyal Customer47Business travelBusiness3939515⋯53254555 0 0
In [39]:
# Number of empty bins to determine discrete
DISCRETE_BINS = 5
field_types_ord_dis<-NPREPROCESSING_discreteNumeric(dataset=dataset,field_types=field_types, cutoff=DISCRETE_BINS)
In [40]:
is_sat<-  ifelse(names(dataset)%in%sat_cols , "SAT", "NOTSAT")
is_time <- ifelse(names(dataset)%in%time_field,"TIME", "NOTTIME")
is_dis <- ifelse(names(dataset)%in%distance_field,'DIS','NOTDIS')
types_table<-data.frame(field=names(dataset),
                        initial=field_types,
                        types=field_types_ord_dis, 
                        is_sat=is_sat,
                        is_time=is_time,
                        is_dis = is_dis)
print(formattable::formattable(types_table))

types_table
A data.frame: 22 × 6
fieldinitialtypesis_satis_timeis_dis
<chr><chr><chr><chr><chr><chr>
satisfactionv2 SYMBOLICSYMBOLICNOTSATNOTTIMENOTDIS
Gender SYMBOLICSYMBOLICNOTSATNOTTIMENOTDIS
CustomerType SYMBOLICSYMBOLICNOTSATNOTTIMENOTDIS
Age NUMERIC ORDINAL NOTSATNOTTIMENOTDIS
TypeofTravel SYMBOLICSYMBOLICNOTSATNOTTIMENOTDIS
Class SYMBOLICSYMBOLICNOTSATNOTTIMENOTDIS
FlightDistance NUMERIC DISCRETENOTSATNOTTIMEDIS
Seatcomfort NUMERIC ORDINAL SAT NOTTIMENOTDIS
DepartureArrivaltimeconvenientNUMERIC ORDINAL SAT NOTTIMENOTDIS
Foodanddrink NUMERIC ORDINAL SAT NOTTIMENOTDIS
Gatelocation NUMERIC ORDINAL SAT NOTTIMENOTDIS
Inflightwifiservice NUMERIC ORDINAL SAT NOTTIMENOTDIS
Inflightentertainment NUMERIC ORDINAL SAT NOTTIMENOTDIS
EaseofOnlinebooking NUMERIC ORDINAL SAT NOTTIMENOTDIS
Onboardservice NUMERIC ORDINAL SAT NOTTIMENOTDIS
Legroomservice NUMERIC ORDINAL SAT NOTTIMENOTDIS
Baggagehandling NUMERIC ORDINAL SAT NOTTIMENOTDIS
Checkinservice NUMERIC ORDINAL SAT NOTTIMENOTDIS
Cleanliness NUMERIC ORDINAL SAT NOTTIMENOTDIS
Onlineboarding NUMERIC ORDINAL SAT NOTTIMENOTDIS
DepartureDelayinMinutes NUMERIC DISCRETENOTSATTIME NOTDIS
ArrivalDelayinMinutes NUMERIC DISCRETENOTSATTIME NOTDIS

Remove outlier in ordinals fields¶

The next step before starting a preprocessing a dataset is removing an outlier from ordinals fields and replace it with mean of each fields

In [41]:
OUTLIER_CONF = 0.85
# get a dataframe contains only ordinals fields
ordinals<-dataset[,which(field_types_ord_dis=='ORDINAL')]
# remove outlier and replace it with means value
ordinals<-NPREPROCESSING_outlier(ordinals=ordinals,confidence=OUTLIER_CONF)
[1] "Outlier field= Age Records= 12 Replaced with MEAN"
In [42]:
# replace an old ordinal field with new ordinal value after remove outlier
dataset[,names(ordinals)]<-ordinals
In [43]:
# function

Preprocess ordinals values¶

Satisfaction fields¶

Normalization (satisfaction fields) : $ Xnormalize = \frac{X}{5}$¶

For ordinal fields which contains a rating of a passenger satisfaction, these field will be normalize by value of 5 which is the different between maximum rating (5) and minimum rating(0). Instead of normalize by a maximum and minimum value in each colums since some fill its does not contains a minimum rating but we want to normalize in a same scale for all the field containing rating information. The range of the value after normalize will be in a range of 0.0 to 1.0.

Others Ordinal fields¶

Normalization (satisfaction fields) : $ Xnormalize = \frac{X-X\min}{X\max -X\min} $¶

For others ordinal fields, each colums will be normalize by substract the value with the minimum vaue and divide by the different between maximum and minimum values. The range of the value after normalize will be in a range of 0.0 to 1.0.

In [44]:
# call function Preprocess ordinal which will normalize a value according to type of its fields
processed_ordinal<-Preprocess_ordinal(dataset,types_table)
head(processed_ordinal)
A matrix: 6 × 14 of type dbl
AgeSeatcomfortDepartureArrivaltimeconvenientFoodanddrinkGatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
0.79452050000.40.40.80.60.60.00.61.00.60.4
0.54794520000.60.00.40.60.80.80.80.40.60.4
0.10958900000.60.40.00.40.60.60.80.80.80.4
0.72602740000.60.60.80.20.20.00.20.80.20.6
0.86301370000.60.80.60.40.40.00.40.80.41.0
0.31506850000.60.40.00.41.00.81.01.00.80.4
In [ ]:

Preprocess Categorical (Symbolic and Discrete values)¶

For preprocessing a Symbolic and Discrete values, symbolic values will be transform from symbolic value to on hot encoding directly. And for discrete values, it will be convert to a symbolic values first then transform those value to a form of on hot encoding.

Convert Discrete values to symbolic¶

The fields *('DepatureDelayinMinutes'), ('ArrivalDelayinMinutes') and ('FlightDistance')* are discrete fields that need to be convert to symbolic value before tranforming it to one hot encoding.

For *('DepatureDelayinMinutes') and ('ArrivalDelayinMinutes')* fields, we convert a delay to 4 symbolic values

  • Value 0 for a delay time equal to 0 minutes or no delay
  • Value 1 for a delay time from 1 to 29 minutes
  • Value 2 for a delay time from 30 to 59 minutes
  • Value 3 for a delay time from 60 minutes above

For *('Flightdistance')* fiedls, according to information from commercial flight duration. The distance of a flight are classified into three categories.

  • Short Haul for distance from 600 to 800 miles or less than
  • Long Haul for distance from 2,200 to 2,600 or more than
  • Medium Haul for a distance in between Short and Long Haul which are distance from 800 to 2200 miles
In [45]:
time_data<-dataset[,time_field]
head(time_data)
time_data<-Process_timeDelay(time_data)
head(time_data)
A data.frame: 6 × 2
DepartureDelayinMinutesArrivalDelayinMinutes
<int><dbl>
1 0 0
2310305
3 0 0
4 0 0
5 0 0
6 0 0
A matrix: 6 × 2 of type chr
DepartureDelayinMinutesArrivalDelayinMinutes
00
33
00
00
00
00
In [46]:
distance_data<-dataset[, grepl(distance_field, names(dataset)), drop = FALSE]
head((distance_data))
distance_data<-Process_distance(distance_data)
head(distance_data)
A data.frame: 6 × 1
FlightDistance
<int>
1 265
22464
32138
4 623
5 354
61894
A matrix: 6 × 1 of type chr
FlightDistance
short
long
medium
short
short
medium
In [47]:
# get symbolic and discrete data 
categor_data<-dataset[,symbolic_index]



# convert a discrete value to symbolic values



# combine symbolic data with converted discrete data
categor_data<-cbind(categor_data,time_data)
categor_data<-cbind(categor_data,distance_data)
head(categor_data)
A data.frame: 6 × 8
satisfactionv2GenderCustomerTypeTypeofTravelClassDepartureDelayinMinutesArrivalDelayinMinutesFlightDistance
<chr><chr><chr><chr><chr><chr><chr><chr>
1satisfiedFemaleLoyal CustomerPersonal TravelEco 00short
2satisfiedMale Loyal CustomerPersonal TravelBusiness33long
3satisfiedFemaleLoyal CustomerPersonal TravelEco 00medium
4satisfiedFemaleLoyal CustomerPersonal TravelEco 00short
5satisfiedFemaleLoyal CustomerPersonal TravelEco 00short
6satisfiedMale Loyal CustomerPersonal TravelEco 00medium

After converted a discrete value to symbolic value then combine the data together in a same dataframe.

Transform both symbolic and discrete data to one hot encoding¶

Then we transform a dataframe containing both symbolic and discrete into a form of one hot encoding.

In [48]:
processed_catdata<-Preprocess_categorical(categor_data)
head(processed_catdata)
A data.frame: 6 × 18
satisfactionv2GenderCustomerTypeTypeofTravelClassBusinessClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2DepartureDelayinMinutes3ArrivalDelayinMinutes0ArrivalDelayinMinutes1ArrivalDelayinMinutes2ArrivalDelayinMinutes3FlightDistancelongFlightDistancemediumFlightDistanceshort
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1010001010001000001
2000010000010001100
3010001010001000010
4010001010001000001
5010001010001000001
6000001010001000010
In [ ]:

In [ ]:

Combine processed Ordinal data and Categorical¶

After finsinih preprocess all the fields in dataset, all the processed dataset is ready for merging into one dataframe

In [49]:
processed_data<-cbind(processed_catdata, processed_ordinal)
#processed_data<-cbind(processed_data,time_data)
In [50]:
# Convert label 0 to 1 and vice versa for satisfation label to represent 1 for statisfy  and 0 for not 
processed_data$satisfactionv2 <- ifelse(as.numeric(processed_data$satisfactionv2)==0,1,0)
head(processed_data)
A data.frame: 6 × 32
satisfactionv2GenderCustomerTypeTypeofTravelClassBusinessClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2⋯GatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>⋯<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
11100010100⋯0.40.40.80.60.60.00.61.00.60.4
21000100000⋯0.60.00.40.60.80.80.80.40.60.4
31100010100⋯0.60.40.00.40.60.60.80.80.80.4
41100010100⋯0.60.60.80.20.20.00.20.80.20.6
51100010100⋯0.60.80.60.40.40.00.40.80.41.0
61000010100⋯0.60.40.00.41.00.81.01.00.80.4
In [51]:
names(processed_data)
  1. 'satisfactionv2'
  2. 'Gender'
  3. 'CustomerType'
  4. 'TypeofTravel'
  5. 'ClassBusiness'
  6. 'ClassEco'
  7. 'ClassEco.Plus'
  8. 'DepartureDelayinMinutes0'
  9. 'DepartureDelayinMinutes1'
  10. 'DepartureDelayinMinutes2'
  11. 'DepartureDelayinMinutes3'
  12. 'ArrivalDelayinMinutes0'
  13. 'ArrivalDelayinMinutes1'
  14. 'ArrivalDelayinMinutes2'
  15. 'ArrivalDelayinMinutes3'
  16. 'FlightDistancelong'
  17. 'FlightDistancemedium'
  18. 'FlightDistanceshort'
  19. 'Age'
  20. 'Seatcomfort'
  21. 'DepartureArrivaltimeconvenient'
  22. 'Foodanddrink'
  23. 'Gatelocation'
  24. 'Inflightwifiservice'
  25. 'Inflightentertainment'
  26. 'EaseofOnlinebooking'
  27. 'Onboardservice'
  28. 'Legroomservice'
  29. 'Baggagehandling'
  30. 'Checkinservice'
  31. 'Cleanliness'
  32. 'Onlineboarding'

Removing Reduntant fields¶

After merging a dataset, we shall check a redundant value in a dataset and remove it. Since

In [52]:
# ************************************************
# Q14: Are any of the fields redundant?
nrow_before<-paste('Number of rows before removing redundant : ', nrow(processed_data))
ncol_before<-paste('Number of columns before removing redundant :', ncol(processed_data))
processed_data<-NPREPROCESSING_redundantFields(dataset=processed_data,cutoff=OUTLIER_CONF)
nrow_after<-paste('Number of rows after removing redundant : ' , nrow(processed_data))
ncol_after<-paste("Number of columns after removing redundant : ", ncol(processed_data))

print(nrow_before)
print(nrow_after)
print(ncol_before)
print(ncol_after)
[1] "Before redundancy check Fields= 32"
Warning message in text.default(pos.xlabel[, 1], pos.xlabel[, 2], newcolnames, srt = tl.srt, :
“"cl.lim" is not a graphical parameter”
Warning message in text.default(pos.ylabel[, 1], pos.ylabel[, 2], newrownames, col = tl.col, :
“"cl.lim" is not a graphical parameter”
Warning message in title(title, ...):
“"cl.lim" is not a graphical parameter”
[1] "Following fields are correlated"
                         row col
ClassEco                   6   5
ClassBusiness              5   6
ArrivalDelayinMinutes3    15  11
DepartureDelayinMinutes3  11  15
[1] "ClassEco ~ ClassBusiness"
[1] "ClassBusiness ~ ClassEco"
[1] "ArrivalDelayinMinutes3 ~ DepartureDelayinMinutes3"
[1] "DepartureDelayinMinutes3 ~ ArrivalDelayinMinutes3"
[1] "Removing the following fields"
[1] "ClassBusiness"            "DepartureDelayinMinutes3"
[1] "Number of rows before removing redundant :  259760"
[1] "Number of rows after removing redundant :  259760"
[1] "Number of columns before removing redundant : 32"
[1] "Number of columns after removing redundant :  30"

After running a function NPREPROCESSING_redundantFields to check a redundant. It found that there are two fields which are redundant that are ClassBusiness and DepartureDelayMinutes3. So these column are remove

In [53]:
head(processed_data)
A data.frame: 6 × 30
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2ArrivalDelayinMinutes0⋯GatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>⋯<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
11100101001⋯0.40.40.80.60.60.00.61.00.60.4
21000000000⋯0.60.00.40.60.80.80.80.40.60.4
31100101001⋯0.60.40.00.40.60.60.80.80.80.4
41100101001⋯0.60.60.80.20.20.00.20.80.20.6
51100101001⋯0.60.80.60.40.40.00.40.80.41.0
61000101001⋯0.60.40.00.41.00.81.01.00.80.4
In [54]:
rm(list=setdiff(ls(), c("processed_data","combined")))
ls()
  1. 'combined'
  2. 'processed_data'
In [ ]:

Models and Evaluation¶

Models¶

For a machine learning that we are going to use are

Supervised Learning¶

  • *Binomial Logistic Regression*
  • *Decison Tree*
  • *Random Forest Classifier*
  • *K-nearest neighbour*
  • *Neural Network*

Unsupervised Leanring¶

  • *K-means Clustering*

Data¶

The data was spilt in two ways.

Evaluations Metrics¶

Since this task are classification problems, we are going these following as our evaluation metrics and graph to the performance of a classification model

  • *Accuracy*
    • To measure which model can identify the realationship and correlation in each fields in dataset.
  • *Loss* (Binary Cross Entrophy)
    • The value indicating a the quality of a model prediction.
  • *Precsion and Recall* (which derived from True Positive,False Positive, True Negative, and False Negative)
    • The number measure the quality of a prediction
  • *F1-score*
    • It is a combination of precison and recall into singel values, whihc use for comparing a performance of different classifiers.
  • *Sensitivity (true positive rate)*
    • It also knowns as True positive rate, it is used for measuring the percentage of labels predict to be postive which is correctly identify.
  • *Specificity (true negative rate)*
    • It also knowns as True negative rate, it is used fro measuring the percentage of labels predict to be negative which is correctly identify.
  • *ROC Curve (receiver operating characteristic curve)*
    • It is a graph showing a classification performance of a model at various threshold to visualize a tradeoff of sensitivity and specificity.
  • *AUC (area under the curve)*
    • It is an entire area under the ROC curve which use to measure a an ability of a model to distinguish between each class

Logistic Regression¶

For logistic Regression is a linear classifier which is suited our task classify between *satisfied* and *neutral or dissatisfied* airline passenger. The outputs of logistic regression is a probabilities between 0.0 to 1.0 for predict whether a passenger is satisfy with an airline service or not. After that determined a threshold to turn propability into predicting classes.

Data¶

The dataset is split into 2 sets which are training set(70%) and testing set(30%)

In [55]:
# Load utilities function for logistic regression

source('utils.R')
options(repr.plot.width = 10, repr.plot.height = 10)

myModelFormula<-function(dataset,fieldNameOutput){
     
  inputs<-paste(names(dataset)[which(names(dataset)!=fieldNameOutput)],collapse = "+")

  output<-paste(fieldNameOutput,"~")

  formular=as.formula(paste(output,inputs))

  return(formular)

} 

LogisticRegressionModel<-function(training_data,testing_data,plot=TRUE,output="satisfactionv2"){

formular<-myModelFormula(training_data,output)
# print("********Formula for Logistic Refression**********")
# print(formular)
# print("*************************************************")

# Logsitic Regression Model
logisticModel<-stats::glm(formular,data=training_data,family=quasibinomial)

    
    
y_train<-training_data[,OUTPUT_FIELD]


# Predict an satisfaction field for testing data (output is probabilites 0-1)
y_pred<- predict(logisticModel,testing_data,type="response")

gt<-testing_data[,OUTPUT_FIELD]

    
title<-'Logistic Regression'

results<-NdetermineThreshold(test_expected=gt,
                                test_predicted=y_pred,
                                plot=plot,
                                title=title)
#results<-eval_model(y_pred,gt,0.60)


    
    
return(list("model" = logisticModel,
       "result" = results)
           )
}

#Selecet Field for Logistic regression

OUTPUT_FIELD = "satisfactionv2"
seed= 123
set.seed(seed)
select_field<-c('satisfactionv2',
               'Gender',
                'CustomerType',
                'TypeofTravel',

                'ClassEco',
                'ClassEco.Plus',
                'DepartureDelayinMinutes0',
                'DepartureDelayinMinutes1',
                'DepartureDelayinMinutes2',

                'ArrivalDelayinMinutes0',
                'ArrivalDelayinMinutes1',
                'ArrivalDelayinMinutes2',
                'ArrivalDelayinMinutes3',
                'FlightDistancelong',
                'FlightDistancemedium',
                'FlightDistanceshort',
                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
                'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
 print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "DepartureDelayinMinutes0"       "DepartureDelayinMinutes1"      
 [9] "DepartureDelayinMinutes2"       "ArrivalDelayinMinutes0"        
[11] "ArrivalDelayinMinutes1"         "ArrivalDelayinMinutes2"        
[13] "ArrivalDelayinMinutes3"         "FlightDistancelong"            
[15] "FlightDistancemedium"           "FlightDistanceshort"           
[17] "Age"                            "Seatcomfort"                   
[19] "DepartureArrivaltimeconvenient" "Foodanddrink"                  
[21] "Gatelocation"                   "Inflightwifiservice"           
[23] "Inflightentertainment"          "EaseofOnlinebooking"           
[25] "Onboardservice"                 "Legroomservice"                
[27] "Baggagehandling"                "Checkinservice"                
[29] "Cleanliness"                    "Onlineboarding"                
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 30
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2ArrivalDelayinMinutes0⋯GatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>⋯<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1827350111100100⋯0.40.60.80.60.40.80.40.60.80.6
1889420011001001⋯0.40.81.00.80.60.60.80.61.00.8
1340580000100001⋯0.40.20.60.20.60.80.80.80.80.8
1240221101001001⋯1.01.01.01.01.00.81.00.40.21.0
1609970000100100⋯0.80.61.00.60.80.80.60.81.00.6
2263181101001000⋯0.20.20.80.20.80.80.80.80.80.8
In [56]:
# Train Logistic Model
Lgm<-LogisticRegressionModel(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-Lgm$result
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”

Visualize Threshold performance plot and ROC curve¶

Threshold performance plot¶

The best threshold for a model is detemined from a performance plot to get the maximum of correct prediction after converting a probablitlies to a labels of classes

ROC curve¶

The Roc curve show a performance of a classifer with a area under a curve of value 0.9 on a test dataset

Logistic Curve¶

This show that the suitable threshold is between 0.49-0.51 since the data points represent a satisfied passenger( blue point) seem to be moslty fit on a curve above 0.5 on a Y axis which represent propability. Where on the other hand for neutral or dissatisfied passenger (red point) are more likey to locate below a propabilites of 0.5

In [57]:
options(repr.plot.width = 10, repr.plot.height = 8)
y_train<-training_data[,OUTPUT_FIELD]
LogisticCurve_plot(Lgm$model,y_train)
[1] "Logistic Plot"
In [ ]:

Confusion matrix¶

Confusion shows a number of corret prediction *(True positive for predicting a passenger who is satisfied correctly and True Negative for predict neutral or dissatisfactied correctly)* and wrong prediction *( False Positive for predicting passenger that neutral or dissatisfied to be satisfeid and False negative for predicting neutral or dissatisfied as satisfied)*

In [58]:
ConfusionMatrix_plot(result$gt,result$pred_labels)
accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba) 
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
            "Threshold :", threshold))
[1] "ACCURACY :  82.974027307258 , PRECISION:  83.5864798374158 , RECALL:  81.4264874439929 , AUC :  0.902687397003121 F1 score: 82.4923466694817 Loss : 0.392690750739928 Threshold : 0.49"

The result from Confusion matrix show us that logistic regression show that this model can distinguish between

Visualize a coefficient of Logistic Regression¶

By visualize a value of coefficent or a weight using a bar chart, we can understand which fields of data have an impact on a model for prediciting which passenger is *satisfied* and *neutral or dissatisfied*

In [59]:
LogisticCoeff_plot(Lgm$model)

The bar chart show that the field with coeffient with neagtive value such as *('CustomerType'),('DepatureArrivalTimeconvient'),('Food and Drink') and ('Age')* have an impact on a model for predicting a customer who is neural or dissatisfied.

And for a field which have clearly an impact on predicting a satisfied passnger are *('Inflightentertainment'),('Onlineboarding'),('Checkinservice'),('TypeofTravel'),('Onboardservice') and ('legroomservice')* which have a high value of positive coefficeint. For other fields that have positive value of coefficents such as ("Gatelocation"),("Baggagehandling") and ("Gender") have a very less impact on a model prediction which can be remove to reduce the complexity of a data feeding to a model which is called Dimisionality reduciton.

Dimesionality reduction¶

In [60]:
#Selecet Field for Logistic regression

OUTPUT_FIELD = "satisfactionv2"
seed= 123
set.seed(seed)
select_field<-c('satisfactionv2',
              'Gender',
                'CustomerType',
                'TypeofTravel',

                'ClassEco',
                'ClassEco.Plus',
#                 'DepartureDelayinMinutes0',
#                 'DepartureDelayinMinutes1',
#                 'DepartureDelayinMinutes2',

#                 'ArrivalDelayinMinutes0',
#                 'ArrivalDelayinMinutes1',
#                 'ArrivalDelayinMinutes2',
#                 'ArrivalDelayinMinutes3',
#                 'FlightDistancelong',
#                 'FlightDistancemedium',
#                 'FlightDistanceshort',
#                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
#                'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
#                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
 print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
 [9] "Foodanddrink"                   "Inflightwifiservice"           
[11] "Inflightentertainment"          "EaseofOnlinebooking"           
[13] "Onboardservice"                 "Legroomservice"                
[15] "Baggagehandling"                "Checkinservice"                
[17] "Onlineboarding"                
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 17
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusSeatcomfortDepartureArrivaltimeconvenientFoodanddrinkInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1827350111100.80.60.80.60.80.60.40.80.40.60.6
1889420011000.20.81.00.81.00.80.60.60.80.60.8
1340580000100.80.80.20.20.60.20.60.80.80.80.8
1240221101001.01.00.81.01.01.01.00.81.00.41.0
1609970000101.01.01.00.61.00.60.80.80.60.80.6
2263181101000.80.21.00.20.80.20.80.80.80.80.8
In [61]:
# Train Logistic Model
Lgm<-LogisticRegressionModel(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-Lgm$result
In [62]:
ConfusionMatrix_plot(result$gt,result$pred_labels)
accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba) 
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
            "Threshold :", threshold))
[1] "ACCURACY :  83.0086746740581 , PRECISION:  84.3004064707969 , RECALL:  80.4991143065541 , AUC :  0.900282465820729 F1 score: 82.3559197814645 Loss : 0.398235649933734 Threshold : 0.51"
In [63]:
LogisticCoeff_plot(Lgm$model)
In [64]:
# Add result to dataFrame
df <-unlist(result[-which(names(result) %in%c("pred_labels","gt","proba"))])
#allResults<-
allResults<-data.frame(LogisticRegression=unlist(df))
allResults
A data.frame: 14 × 1
LogisticRegression
<dbl>
TP3.090200e+04
FN7.486000e+03
FP5.755000e+03
TN3.378500e+04
F18.235592e+01
acc8.300867e+01
pgood8.430041e+01
pbad8.186136e+01
FPR1.455488e+01
TPR8.049911e+01
TNR8.544512e+01
MCC6.605291e-01
threshold5.100000e-01
AUC9.002825e-01

K-FOLD Cross validation on Logistic regression¶

To prevent an overfitting during a training, a team decide to experiment a model training wiht K-fold cross validation to ensure the generalization of a model.

In [65]:
# K-FOLD
KFOLDS          <- 10  # Number of folded experiments
dataset<-stratifiedDataset(processed_data)
measures<-runExperiment(dataset = dataset,FUN = LogisticRegressionModel)
[1] "FOLD : , 1"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 2"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 3"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 4"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 5"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 6"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 7"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 8"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 9"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
[1] "FOLD : , 10"
Warning message in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
“prediction from a rank-deficient fit may be misleading”
In [66]:
# Display a result of a model using K-fold crossvalidation
print("Result of each Fold")
data.frame(fold=(t(measures$allresults)))
print("Average result of each Fold")
t(measures$means)
[1] "Result of each Fold"
A data.frame: 14 × 10
fold.1fold.2fold.3fold.4fold.5fold.6fold.7fold.8fold.9fold.10
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP1.029000e+041.029300e+041.025200e+041.026100e+041.039900e+041.040800e+041.048900e+041.031200e+041.049200e+041.026200e+04
FN2.462000e+032.459000e+032.500000e+032.491000e+032.353000e+032.343000e+032.262000e+032.439000e+032.259000e+032.489000e+03
FP1.803000e+031.862000e+031.815000e+031.831000e+032.004000e+032.001000e+032.132000e+031.831000e+032.152000e+031.853000e+03
TN1.142200e+041.136300e+041.141000e+041.139400e+041.122100e+041.122300e+041.109200e+041.139300e+041.107200e+041.137100e+04
F18.283357e+018.265146e+018.261413e+018.260345e+018.267939e+018.273450e+018.268170e+018.284727e+018.263044e+018.253841e+01
acc8.358163e+018.336605e+018.338915e+018.336221e+018.322747e+018.327623e+018.308373e+018.356112e+018.301829e+018.328393e+01
pgood8.509055e+018.468120e+018.495898e+018.485776e+018.384262e+018.387461e+018.310752e+018.492135e+018.298007e+018.470491e+01
pbad8.226736e+018.220952e+018.202732e+018.205978e+018.266539e+018.272888e+018.306126e+018.236698e+018.305453e+018.204185e+01
FPR1.363327e+011.407940e+011.372401e+011.384499e+011.515312e+011.513158e+011.612220e+011.384604e+011.627344e+011.401240e+01
TPR8.069322e+018.071675e+018.039523e+018.046581e+018.154799e+018.162497e+018.226021e+018.087209e+018.228374e+018.047996e+01
TNR8.636673e+018.592060e+018.627599e+018.615501e+018.484688e+018.486842e+018.387780e+018.615396e+018.372656e+018.598760e+01
MCC6.720877e-016.676392e-016.682858e-016.676901e-016.645142e-016.654842e-016.615339e-016.715706e-016.602245e-016.660701e-01
threshold5.200000e-015.100000e-015.200000e-015.200000e-015.000000e-015.000000e-014.800000e-015.100000e-014.800000e-015.200000e-01
AUC9.046909e-019.015734e-019.036265e-019.019558e-019.044649e-019.043099e-019.022971e-019.044582e-019.032176e-019.033666e-01
[1] "Average result of each Fold"
A matrix: 1 × 14
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
10345240519281129682.6883.3184.382.4514.5881.1385.420.670.510.9

After the training is done, we can visualize at the result of each fold during training. We can see that after applying *K-fold cross validation, The number of True positive, False positive, True negative and False Negative* is small than a number in a result of model without *K-fold cross validation. This is because when the K-Fold is apply, it divide a dataset smaller number of group depending on the value of K. Hence the number of testing set use for evaluating a result get split into a smaller number of dataset.

But for other metrics such as *F1 score, Accuracy and Area Under the curve(AUC)* is very close to the result from model without *K-Fold. As a conclusion applying K-fold cross-validation* to logistic regression on passenger statisfaction dataset doesn not result in huge different but it does ensure that the model that trained does generalized well.

In [67]:
# Create a data frame to compare results from different experiments
allResults<-cbind(allResults,data.frame(Logistic_with_Kfold=unlist(measures$means)))
allResults
A data.frame: 14 × 2
LogisticRegressionLogistic_with_Kfold
<dbl><dbl>
TP3.090200e+0410345.00
FN7.486000e+03 2405.00
FP5.755000e+03 1928.00
TN3.378500e+0411296.00
F18.235592e+01 82.68
acc8.300867e+01 83.31
pgood8.430041e+01 84.30
pbad8.186136e+01 82.45
FPR1.455488e+01 14.58
TPR8.049911e+01 81.13
TNR8.544512e+01 85.42
MCC6.605291e-01 0.67
threshold5.100000e-01 0.51
AUC9.002825e-01 0.90
In [ ]:

In [ ]:

Decision Tree¶

Decsion Tree is a tree based algorithms which doesn't need dataset to be linear seperable. It can handle a data with a missing values and outlier. It can be used in both classification and regression.

It can work well with both linear seperable and non seperable it is suitable for classifying a satifation of customer. Since, passenger dataset contains complex relationshi between each fields such as fields containg rating scale of cutomer in each categories.

In [68]:
# Select field for Decsion Tree
OUTPUT_FIELD = "satisfactionv2"
seed= 123
set.seed(seed)
select_field<-c('satisfactionv2',
               'Gender',
                'CustomerType',
                'TypeofTravel',
             
                'ClassEco',
                'ClassEco.Plus',
                'DepartureDelayinMinutes0',
                'DepartureDelayinMinutes1',
                'DepartureDelayinMinutes2',
             
                'ArrivalDelayinMinutes0',
                'ArrivalDelayinMinutes1',
                'ArrivalDelayinMinutes2',
                'ArrivalDelayinMinutes3',
                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
                'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
 print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)

DecisionTree<-function(training_data, testing_data,plot=TRUE,output_field="satisfactionv2"){

# Training data
x_train<-training_data[,names(training_data [-which(names(training_data)%in% output_field)])]
y_train<-training_data[,output_field]
# Testing data
x_test<-testing_data[,names(training_data [-which(names(testing_data)%in% output_field)])]
y_test<-testing_data[,output_field]


DTmodel<-C50::C5.0(x=x_train,
                  y=factor(y_train),
                  ,
                  trials=1)
    
#y_pred<-predict()
    
    
class_proba<-predict(DTmodel, newdata=x_test, type="prob")

# Get the column index with the class label
classIndex<-which(as.numeric(colnames(class_proba))==1)

# Get the probabilities for classifying the satisfaction
y_proba<-class_proba[,classIndex]


title<-'Decision Tree'
results<-NdetermineThreshold(test_expected=y_test,
                                test_predicted=y_proba,
                                plot=plot,
                                title=title)


    
return(list("model" = DTmodel, "result" = results))
}
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "DepartureDelayinMinutes0"       "DepartureDelayinMinutes1"      
 [9] "DepartureDelayinMinutes2"       "ArrivalDelayinMinutes0"        
[11] "ArrivalDelayinMinutes1"         "ArrivalDelayinMinutes2"        
[13] "ArrivalDelayinMinutes3"         "Age"                           
[15] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
[17] "Foodanddrink"                   "Gatelocation"                  
[19] "Inflightwifiservice"            "Inflightentertainment"         
[21] "EaseofOnlinebooking"            "Onboardservice"                
[23] "Legroomservice"                 "Baggagehandling"               
[25] "Checkinservice"                 "Cleanliness"                   
[27] "Onlineboarding"                
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 27
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2ArrivalDelayinMinutes0⋯GatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>⋯<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1827350111100100⋯0.40.60.80.60.40.80.40.60.80.6
1889420011001001⋯0.40.81.00.80.60.60.80.61.00.8
1340580000100001⋯0.40.20.60.20.60.80.80.80.80.8
1240221101001001⋯1.01.01.01.01.00.81.00.40.21.0
1609970000100100⋯0.80.61.00.60.80.80.60.81.00.6
2263181101001000⋯0.20.20.80.20.80.80.80.80.80.8
In [69]:
DT<-DecisionTree(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-DT$result
In [ ]:

Visualize Threshold performance plot and ROC curve¶

Threshold performance plot¶

The best threshold for a model is detemined from a performance plot to get the maximum of correct prediction after converting a probablitlies to a labels of classes at threshold value of 0.43

ROC curve¶

The Roc curve show a performance of a classifer with a area under a curve of value 0.98 on a test dataset which is much higer compare to the area under curve of logistic regression. Also values of specificty is lower and sensitivity is higher than logistic regression

In [70]:
options(repr.plot.width = 10, repr.plot.height = 8)
ConfusionMatrix_plot(result$gt,result$pred_labels)

accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba)
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
             "Threshold :", threshold))
[1] "ACCURACY :  93.9379940457859 , PRECISION:  94.7611956174875 , RECALL:  92.8258830884651 , AUC :  0.982977647394342 F1 score: 93.7835561638067 Loss : 0.159224110765153 Threshold : 0.42"

The confusion matrix of Decision Tree clearly shown a different of number in of correct and incorrect prediction simialr to logistice regression but with huge different in number of False Negative.

The Final accuracy for Decison Tree model is 94 % which is very high compare to logistice regression. These proves that passenger dataset contains a complex relationship between each field, the information in dataset tends to non-linear seperable which is the reason why tree based model outperform liner classifier like logistic regression

Importance Variable (Fields) in Decison Tree¶

In [71]:
TreeCoeff_plot(DT$model)

Dimensionality Reduction¶

In [72]:
# Select field for Decsion Tree
OUTPUT_FIELD = "satisfactionv2"
seed= 123
set.seed(seed)
select_field<-c('satisfactionv2',
               'Gender',
                'CustomerType',
                'TypeofTravel',
             
                'ClassEco',
                'ClassEco.Plus',
#                 'DepartureDelayinMinutes0',
#                 'DepartureDelayinMinutes1',
#                 'DepartureDelayinMinutes2',
             
#                 'ArrivalDelayinMinutes0',
#                 'ArrivalDelayinMinutes1',
#                 'ArrivalDelayinMinutes2',
#                 'ArrivalDelayinMinutes3',
#                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
                 'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
 print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
 [9] "Foodanddrink"                   "Gatelocation"                  
[11] "Inflightwifiservice"            "Inflightentertainment"         
[13] "EaseofOnlinebooking"            "Onboardservice"                
[15] "Legroomservice"                 "Baggagehandling"               
[17] "Checkinservice"                 "Cleanliness"                   
[19] "Onlineboarding"                
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 19
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusSeatcomfortDepartureArrivaltimeconvenientFoodanddrinkGatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1827350111100.80.60.80.40.60.80.60.40.80.40.60.80.6
1889420011000.20.81.00.40.81.00.80.60.60.80.61.00.8
1340580000100.80.80.20.40.20.60.20.60.80.80.80.80.8
1240221101001.01.00.81.01.01.01.01.00.81.00.40.21.0
1609970000101.01.01.00.80.61.00.60.80.80.60.81.00.6
2263181101000.80.21.00.20.20.80.20.80.80.80.80.80.8
In [73]:
DT<-DecisionTree(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-DT$result
In [74]:
options(repr.plot.width = 10, repr.plot.height = 8)
ConfusionMatrix_plot(result$gt,result$pred_labels)

accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba)
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
             "Threshold :", threshold))
[1] "ACCURACY :  93.9007802073709 , PRECISION:  94.6099366030929 , RECALL:  92.9118474523289 , AUC :  0.983088001664342 F1 score: 93.753203569598 Loss : 0.159883970190253 Threshold : 0.42"
In [75]:
TreeCoeff_plot(DT$model)
In [ ]:

In [ ]:

In [76]:
df <-unlist(result[-which(names(result) %in%c("pred_labels","gt","proba"))])
#allResults<-
allResults<-cbind(allResults,data.frame(DecisionTree=unlist(df)))
allResults
A data.frame: 14 × 3
LogisticRegressionLogistic_with_KfoldDecisionTree
<dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+04
FN7.486000e+03 2405.002.721000e+03
FP5.755000e+03 1928.002.032000e+03
TN3.378500e+0411296.003.750800e+04
F18.235592e+01 82.689.375320e+01
acc8.300867e+01 83.319.390078e+01
pgood8.430041e+01 84.309.460994e+01
pbad8.186136e+01 82.459.323622e+01
FPR1.455488e+01 14.585.139100e+00
TPR8.049911e+01 81.139.291185e+01
TNR8.544512e+01 85.429.486090e+01
MCC6.605291e-01 0.678.780945e-01
threshold5.100000e-01 0.514.200000e-01
AUC9.002825e-01 0.909.830880e-01

Decision Tree with K-Fold Cross Validation¶

In [77]:
KFOLDS <- 10  # Number of folded experiments
dataset<-stratifiedDataset(processed_data)
measures<-runExperiment(dataset = dataset,FUN = DecisionTree)
[1] "FOLD : , 1"
[1] "FOLD : , 2"
[1] "FOLD : , 3"
[1] "FOLD : , 4"
[1] "FOLD : , 5"
[1] "FOLD : , 6"
[1] "FOLD : , 7"
[1] "FOLD : , 8"
[1] "FOLD : , 9"
[1] "FOLD : , 10"
In [78]:
data.frame(t(measures$allresults))

data.frame(measures$means)
A data.frame: 14 × 10
X1X2X3X4X5X6X7X8X9X10
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP1.200800e+041.191600e+041.194100e+041.194500e+041.192600e+041.196300e+041.196500e+041.191300e+041.191200e+041.191500e+04
FN7.440000e+028.360000e+028.110000e+028.070000e+028.260000e+027.880000e+027.860000e+028.380000e+028.390000e+028.360000e+02
FP6.570000e+026.080000e+026.140000e+026.410000e+026.420000e+026.390000e+026.490000e+025.880000e+026.290000e+026.430000e+02
TN1.256800e+041.261700e+041.261100e+041.258400e+041.258300e+041.258500e+041.257500e+041.263600e+041.259500e+041.258100e+04
F19.448794e+019.428707e+019.436915e+019.428526e+019.420221e+019.437147e+019.434260e+019.435292e+019.419579e+019.415623e+01
acc9.460677e+019.444124e+019.451438e+019.442584e+019.434885e+019.450626e+019.447546e+019.451011e+019.434841e+019.430606e+01
pgood9.481248e+019.514532e+019.510952e+019.490704e+019.489179e+019.492938e+019.485492e+019.529638e+019.498445e+019.487976e+01
pbad9.441106e+019.378577e+019.395768e+019.397356e+019.383996e+019.410753e+019.411721e+019.378061e+019.375465e+019.376910e+01
FPR4.967864e+004.597353e+004.642722e+004.846881e+004.854442e+004.832123e+004.907743e+004.446461e+004.756503e+004.862371e+00
TPR9.416562e+019.344417e+019.364021e+019.367158e+019.352258e+019.382009e+019.383578e+019.342797e+019.342012e+019.344365e+01
TNR9.503214e+019.540265e+019.535728e+019.515312e+019.514556e+019.516788e+019.509226e+019.555354e+019.524350e+019.513763e+01
MCC8.921064e-018.888894e-018.903234e-018.885265e-018.869994e-018.901243e-018.895008e-018.902924e-018.870135e-018.861506e-01
threshold3.400000e-014.000000e-013.700000e-014.000000e-013.800000e-013.600000e-013.500000e-014.100000e-013.900000e-013.500000e-01
AUC9.866436e-019.859242e-019.862077e-019.853778e-019.852447e-019.856918e-019.860954e-019.862259e-019.856574e-019.850677e-01
A data.frame: 1 × 14
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
119408116311259394.3194.4594.9893.954.7793.6495.230.890.380.99
In [79]:
# Create a data frame to compare results from different experiments
allResults<-cbind(allResults,data.frame(DecisionTree_with_Kfold=unlist(measures$means)))
allResults
A data.frame: 14 × 4
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_Kfold
<dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.00
FN7.486000e+03 2405.002.721000e+03 811.00
FP5.755000e+03 1928.002.032000e+03 631.00
TN3.378500e+0411296.003.750800e+0412593.00
F18.235592e+01 82.689.375320e+01 94.31
acc8.300867e+01 83.319.390078e+01 94.45
pgood8.430041e+01 84.309.460994e+01 94.98
pbad8.186136e+01 82.459.323622e+01 93.95
FPR1.455488e+01 14.585.139100e+00 4.77
TPR8.049911e+01 81.139.291185e+01 93.64
TNR8.544512e+01 85.429.486090e+01 95.23
MCC6.605291e-01 0.678.780945e-01 0.89
threshold5.100000e-01 0.514.200000e-01 0.38
AUC9.002825e-01 0.909.830880e-01 0.99
In [ ]:

In [80]:
#Clear cache prevent kernel kill
ls()
rm(list=setdiff(ls(),c("processed_data","DecisionTree","allResults","training_data","testing_data","OUTPUT_FIELD","combined")))
source('utils.R')
ls()
  1. 'accuracy'
  2. 'allocateFoldID'
  3. 'allResults'
  4. 'auc'
  5. 'auroc'
  6. 'BCE_loss'
  7. 'combined'
  8. 'ConfusionMatrix_plot'
  9. 'dataset'
  10. 'DecisionTree'
  11. 'df'
  12. 'DT'
  13. 'eval_model'
  14. 'f1_score'
  15. 'F1_score'
  16. 'ForestCoeff_plot'
  17. 'KFOLDS'
  18. 'Lgm'
  19. 'LogisticCoeff_plot'
  20. 'LogisticCurve_plot'
  21. 'LogisticRegressionModel'
  22. 'loss'
  23. 'measures'
  24. 'myModelFormula'
  25. 'NConvertClass'
  26. 'NdetermineThreshold'
  27. 'OUTPUT_FIELD'
  28. 'precision'
  29. 'processed_data'
  30. 'recall'
  31. 'result'
  32. 'runExperiment'
  33. 'seed'
  34. 'select_field'
  35. 'selected_data'
  36. 'stratifiedDataset'
  37. 'stratifiedSplit'
  38. 'testing_data'
  39. 'threshold'
  40. 'training_data'
  41. 'training_records'
  42. 'TreeCoeff_plot'
  43. 'y_train'
  1. 'allocateFoldID'
  2. 'allResults'
  3. 'auroc'
  4. 'BCE_loss'
  5. 'combined'
  6. 'ConfusionMatrix_plot'
  7. 'DecisionTree'
  8. 'eval_model'
  9. 'F1_score'
  10. 'ForestCoeff_plot'
  11. 'LogisticCoeff_plot'
  12. 'LogisticCurve_plot'
  13. 'NConvertClass'
  14. 'NdetermineThreshold'
  15. 'OUTPUT_FIELD'
  16. 'processed_data'
  17. 'runExperiment'
  18. 'stratifiedDataset'
  19. 'stratifiedSplit'
  20. 'testing_data'
  21. 'training_data'
  22. 'TreeCoeff_plot'

Decision Tree on unclean dataset¶

In [81]:
OUTPUT_FIELD = "satisfactionv2"
source('utils.R')
selected_data<-NConvertClass(combined,OUTPUT_FIELD)

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 24
satisfactionv2GenderCustomerTypeAgeTypeofTravelClassFlightDistanceSeatcomfortDepartureArrivaltimeconvenientFoodanddrink⋯EaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboardingDepartureDelayinMinutesArrivalDelayinMinutesInflightservice
<chr><chr><chr><int><chr><chr><int><int><int><int>⋯<int><int><int><int><int><int><int><int><int><int>
1555881Male Loyal Customer 8Personal TravelEco 334545⋯353555320 5
824131Male Loyal Customer 47Business travelEco 1952333⋯341424300NA
1598981FemaleLoyal Customer 37Personal TravelEco 247353⋯355433300 4
428491Femaledisloyal Customer40Business travelBusiness1511111⋯123413100NA
426211Femaledisloyal Customer35Business travelEco 2358111⋯542323500NA
526781Femaledisloyal Customer49Business travelBusiness1550333⋯435445400NA
In [82]:
uncleanDT<-DecisionTree(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
In [83]:
result<-uncleanDT$result
In [84]:
options(repr.plot.width = 10, repr.plot.height = 8)
ConfusionMatrix_plot(result$gt,result$pred_labels)

accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(as.numeric(result$gt),as.numeric(result$proba))
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
             "Threshold :", threshold))
[1] "ACCURACY :  94.7682476131814 , PRECISION:  94.0089641434263 , RECALL:  95.7615726062143 , AUC :  0.986940094481763 F1 score: 94.8771753471131 Loss : 0.141069008438183 Threshold : 0.63"
In [ ]:

In [85]:
df <-unlist(result[-which(names(result) %in%c("pred_labels","gt","proba"))])
#allResults<-
allResults<-cbind(allResults,data.frame(DecisionTree_unclean=unlist(df)))
allResults
A data.frame: 14 × 5
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_unclean
<dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+04
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03
TN3.378500e+0411296.003.750800e+0412593.003.609700e+04
F18.235592e+01 82.689.375320e+01 94.319.487718e+01
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01

K-fold decision tree with Unclean data¶

In [86]:
KFOLDS <- 10  # Number of folded experiments
dataset<-stratifiedDataset(selected_data)
measures<-runExperiment(dataset = dataset,FUN = DecisionTree)
[1] "FOLD : , 1"
[1] "FOLD : , 2"
[1] "FOLD : , 3"
[1] "FOLD : , 4"
[1] "FOLD : , 5"
[1] "FOLD : , 6"
[1] "FOLD : , 7"
[1] "FOLD : , 8"
[1] "FOLD : , 9"
[1] "FOLD : , 10"
In [87]:
data.frame(t(measures$allresults))

data.frame(measures$means)
A data.frame: 14 × 10
X1X2X3X4X5X6X7X8X9X10
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP1.264800e+041.269300e+041.271500e+041.276000e+041.269800e+041.268400e+041.267000e+041.260900e+041.265900e+041.269900e+04
FN5.770000e+025.320000e+025.100000e+024.650000e+025.270000e+025.400000e+025.540000e+026.150000e+025.650000e+025.250000e+02
FP7.290000e+027.740000e+028.000000e+027.690000e+028.060000e+028.070000e+027.950000e+027.540000e+028.050000e+028.400000e+02
TN1.202300e+041.197800e+041.195200e+041.198300e+041.194600e+041.194400e+041.195600e+041.199700e+041.194600e+041.191100e+04
F19.509059e+019.510715e+019.510097e+019.538761e+019.501291e+019.495789e+019.494548e+019.485087e+019.486661e+019.489967e+01
acc9.497248e+019.497248e+019.495708e+019.524964e+019.486854e+019.481424e+019.480654e+019.472955e+019.472570e+019.474495e+01
pgood9.455035e+019.425262e+019.408065e+019.431591e+019.403140e+019.401823e+019.409580e+019.435755e+019.402109e+019.379570e+01
pbad9.542063e+019.574740e+019.590756e+019.626446e+019.577487e+019.567446e+019.557154e+019.512369e+019.548397e+019.577839e+01
FPR5.716750e+006.069636e+006.273526e+006.030427e+006.320577e+006.328915e+006.234805e+005.913262e+006.313230e+006.587719e+00
TPR9.563705e+019.597732e+019.614367e+019.648393e+019.601512e+019.591652e+019.581065e+019.534936e+019.572747e+019.602995e+01
TNR9.428325e+019.393036e+019.372647e+019.396957e+019.367942e+019.367108e+019.376519e+019.408674e+019.368677e+019.341228e+01
MCC8.994564e-018.995384e-018.992916e-019.051692e-018.975039e-018.964013e-018.962158e-018.945867e-018.945964e-018.950813e-01
threshold6.400000e-016.200000e-016.300000e-016.300000e-016.300000e-016.200000e-016.300000e-016.600000e-016.200000e-015.900000e-01
AUC9.877496e-019.877244e-019.871862e-019.873950e-019.874951e-019.880413e-019.875754e-019.874538e-019.871841e-019.865964e-01
A data.frame: 1 × 14
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
126835417871196395.0294.8894.1595.676.1895.9193.820.90.630.99
In [88]:
# Create a data frame to compare results from different experiments
allResults<-cbind(allResults,data.frame(DecisionTree_Kfold_Unclean=unlist(measures$means)))
allResults
A data.frame: 14 × 6
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_Unclean
<dbl><dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+0412683.00
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03 541.00
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03 787.00
TN3.378500e+0411296.003.750800e+0412593.003.609700e+0411963.00
F18.235592e+01 82.689.375320e+01 94.319.487718e+01 95.02
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01 94.88
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01 94.15
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01 95.67
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00 6.18
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01 95.91
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01 93.82
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01 0.90
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01 0.63
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01 0.99

Random Forest Classification¶

Random Forest is an ensemble learning model in machine learning, It can be use in both regression or classification. It consist of mulitple number of decision tree inside which each tree spit out a prediction of a class and prediction of a class with the most occur or vote from trees become a final result of a model.

Using this model may improve the result from traditional Decision tree but it has longer training time since there are mulitple tree.

Random Forest on Clean dataset¶

In [89]:
# Select field for Random forest
OUTPUT_FIELD = "satisfactionv2"

select_field<-c('satisfactionv2',
               'Gender',
                'CustomerType',
                'TypeofTravel',
               
                'ClassEco',
                'ClassEco.Plus',
                'DepartureDelayinMinutes0',
                'DepartureDelayinMinutes1',
                'DepartureDelayinMinutes2',

                'ArrivalDelayinMinutes0',
                'ArrivalDelayinMinutes1',
                'ArrivalDelayinMinutes2',
                'ArrivalDelayinMinutes3',
                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
                'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "DepartureDelayinMinutes0"       "DepartureDelayinMinutes1"      
 [9] "DepartureDelayinMinutes2"       "ArrivalDelayinMinutes0"        
[11] "ArrivalDelayinMinutes1"         "ArrivalDelayinMinutes2"        
[13] "ArrivalDelayinMinutes3"         "Age"                           
[15] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
[17] "Foodanddrink"                   "Gatelocation"                  
[19] "Inflightwifiservice"            "Inflightentertainment"         
[21] "EaseofOnlinebooking"            "Onboardservice"                
[23] "Legroomservice"                 "Baggagehandling"               
[25] "Checkinservice"                 "Cleanliness"                   
[27] "Onlineboarding"                
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 27
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2ArrivalDelayinMinutes0⋯GatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>⋯<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
844080101101001⋯0.40.20.60.60.60.60.60.80.60.6
1296581101001001⋯0.81.00.61.01.01.01.00.41.00.4
118801100101001⋯0.80.80.80.80.41.00.80.20.80.8
1034981001000100⋯0.20.41.00.80.80.80.80.60.80.6
1918801011101001⋯0.81.01.01.00.21.00.40.21.01.0
950911001000100⋯0.41.01.00.80.80.80.81.00.80.6
In [90]:
RandomForestClassifier<-function(training_data, testing_data, plot=TRUE, output_field="satisfactionv2"){
    
    
# Training data
x_train<-training_data[,names(training_data [-which(names(training_data)%in% output_field)])]
y_train<-training_data[,output_field]
# Testing data
x_test<-testing_data[,names(training_data [-which(names(testing_data)%in% output_field)])]
y_test<-testing_data[,output_field]



RandForestModel<-randomForest::randomForest(x_train,
                               factor(y_train),
                               ntree=50 ,
                               importance=TRUE,
                               mtry=sqrt(ncol(x_train)),
                               na.action=randomForest::na.roughfix)
    
#y_pred<-predict()
    
    
class_proba<-predict(RandForestModel, newdata=x_test, type="prob")

# Get the column index with the class label
classIndex<-which(as.numeric(colnames(class_proba))==1)

# Get the probabilities for classifying the good loans
y_proba<-class_proba[,classIndex]


title<-'Random Forest'
results<-NdetermineThreshold(test_expected=y_test,
                                test_predicted=y_proba,
                                plot=plot,
                                title=title)

accuracy<-results$acc
precision<-results$pgood
recall<-results$TPR
auc<-results$AUC

    
return(list("model"=RandForestModel,"result"=results))
}
In [91]:
RFmodel<-RandomForestClassifier(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-RFmodel$result

Confusion Matrix¶

In [92]:
options(repr.plot.width = 10, repr.plot.height =8)
ConfusionMatrix_plot(result$gt,result$pred_labels)

accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba)
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
            "Threshold :", threshold))
[1] "ACCURACY :  95.0544091982343 , PRECISION:  95.6612664125371 , RECALL:  94.2245515227368 , AUC :  0.989395080289534 F1 score: 94.9374737284573 Loss : 0.14175898380384 Threshold : 0.47"

Impotance Variable(fields) on Random Forest Classifer¶

In [93]:
ForestCoeff_plot(RFmodel$model)
In [94]:
options(repr.plot.width = 10, repr.plot.height = 8)
# Variable importance plot
randomForest::varImpPlot(RFmodel$model)

Dimensionality reduction¶

In [95]:
# Select field for Random forest
OUTPUT_FIELD = "satisfactionv2"

select_field<-c('satisfactionv2',
               'Gender',
                'CustomerType',
                'TypeofTravel',
               
                'ClassEco',
                'ClassEco.Plus',
#                 'DepartureDelayinMinutes0',
#                 'DepartureDelayinMinutes1',
#                 'DepartureDelayinMinutes2',

#                 'ArrivalDelayinMinutes0',
#                 'ArrivalDelayinMinutes1',
#                 'ArrivalDelayinMinutes2',
#                 'ArrivalDelayinMinutes3',
                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
                'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "Age"                            "Seatcomfort"                   
 [9] "DepartureArrivaltimeconvenient" "Foodanddrink"                  
[11] "Gatelocation"                   "Inflightwifiservice"           
[13] "Inflightentertainment"          "EaseofOnlinebooking"           
[15] "Onboardservice"                 "Legroomservice"                
[17] "Baggagehandling"                "Checkinservice"                
[19] "Cleanliness"                    "Onlineboarding"                
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 20
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusAgeSeatcomfortDepartureArrivaltimeconvenientFoodanddrinkGatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2008411101100.47945210.21.00.21.00.40.41.00.40.40.41.00.20.4
72200000100.57534250.21.00.20.60.60.20.61.00.40.80.81.00.6
439700011100.27397260.20.60.20.60.80.20.80.60.60.60.40.80.8
2011090101100.72602740.60.80.20.80.40.40.80.40.40.40.20.80.4
626031011000.28767121.01.01.00.20.60.80.60.80.40.81.00.80.6
1311720000000.16438360.20.20.20.40.20.20.20.40.60.20.60.20.2
In [96]:
RFmodel<-RandomForestClassifier(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-RFmodel$result
In [97]:
options(repr.plot.width = 10, repr.plot.height =8)
ConfusionMatrix_plot(result$gt,result$pred_labels)

accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba)
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
            "Threshold :", threshold))
[1] "ACCURACY :  95.0993224514937 , PRECISION:  95.0523086604263 , RECALL:  94.9402891263356 , AUC :  0.989790760938095 F1 score: 94.9962658700523 Loss : 0.137277759981508 Threshold : 0.43"
In [98]:
ForestCoeff_plot(RFmodel$model)
In [99]:
df <-unlist(result[-which(names(result) %in%c("pred_labels","gt","proba"))])
#allResults<-
allResults<-cbind(allResults,data.frame(RandomForest=unlist(df)))
allResults
A data.frame: 14 × 7
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_UncleanRandomForest
<dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+0412683.003.625200e+04
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03 541.001.932000e+03
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03 787.001.887000e+03
TN3.378500e+0411296.003.750800e+0412593.003.609700e+0411963.003.785700e+04
F18.235592e+01 82.689.375320e+01 94.319.487718e+01 95.029.499627e+01
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01 94.889.509932e+01
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01 94.159.505231e+01
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01 95.679.514439e+01
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00 6.184.747886e+00
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01 95.919.494029e+01
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01 93.829.525211e+01
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01 0.909.019455e-01
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01 0.634.300000e-01
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01 0.999.897908e-01
In [100]:
#Clear cache prevent kernel kill
ls()
rm(list=setdiff(ls(),c("processed_data","RandomForestClassifier","DecisionTree","allResults","training_data","testing_data","OUTPUT_FIELD","combined")))
source('utils.R')
ls()
  1. 'accuracy'
  2. 'allocateFoldID'
  3. 'allResults'
  4. 'auc'
  5. 'auroc'
  6. 'BCE_loss'
  7. 'combined'
  8. 'ConfusionMatrix_plot'
  9. 'dataset'
  10. 'DecisionTree'
  11. 'df'
  12. 'eval_model'
  13. 'f1_score'
  14. 'F1_score'
  15. 'ForestCoeff_plot'
  16. 'KFOLDS'
  17. 'LogisticCoeff_plot'
  18. 'LogisticCurve_plot'
  19. 'loss'
  20. 'measures'
  21. 'NConvertClass'
  22. 'NdetermineThreshold'
  23. 'OUTPUT_FIELD'
  24. 'precision'
  25. 'processed_data'
  26. 'RandomForestClassifier'
  27. 'recall'
  28. 'result'
  29. 'RFmodel'
  30. 'runExperiment'
  31. 'select_field'
  32. 'selected_data'
  33. 'stratifiedDataset'
  34. 'stratifiedSplit'
  35. 'testing_data'
  36. 'threshold'
  37. 'training_data'
  38. 'training_records'
  39. 'TreeCoeff_plot'
  40. 'uncleanDT'
  1. 'allocateFoldID'
  2. 'allResults'
  3. 'auroc'
  4. 'BCE_loss'
  5. 'combined'
  6. 'ConfusionMatrix_plot'
  7. 'DecisionTree'
  8. 'eval_model'
  9. 'F1_score'
  10. 'ForestCoeff_plot'
  11. 'LogisticCoeff_plot'
  12. 'LogisticCurve_plot'
  13. 'NConvertClass'
  14. 'NdetermineThreshold'
  15. 'OUTPUT_FIELD'
  16. 'processed_data'
  17. 'RandomForestClassifier'
  18. 'runExperiment'
  19. 'stratifiedDataset'
  20. 'stratifiedSplit'
  21. 'testing_data'
  22. 'training_data'
  23. 'TreeCoeff_plot'

K-FOLD Cross Validation on Random Forest on clean dataset¶

In [101]:
KFOLDS <- 6# Number of folded 
dataset<-stratifiedDataset(processed_data)
measures<-runExperiment(dataset = dataset,FUN = RandomForestClassifier)
[1] "FOLD : , 1"
[1] "FOLD : , 2"
[1] "FOLD : , 3"
[1] "FOLD : , 4"
[1] "FOLD : , 5"
[1] "FOLD : , 6"
In [102]:
data.frame(measures$allresults)
A data.frame: 6 × 14
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2007111828412120095.2021895.3273095.9783994.718973.81561694.4384396.184380.90660080.470.9908579
2011111429652107695.0223395.1332795.4213394.860024.37820494.6266495.621800.90264890.450.9906108
2014911049752106695.0940495.1979595.3844095.020304.42357494.8054495.576430.90393280.450.9908183
2014011129592108295.1099195.2163295.4547694.989644.35098294.7675595.649020.90430480.450.9903975
2008711658982114395.1156695.2348095.7207594.777664.07422594.5181695.925770.90471170.470.9904982
2016510879262111495.2459795.3501895.6095095.103824.20145294.8851995.798550.90698530.450.9908857
In [103]:
# Create a data frame to compare results from different experiments
allResults<-cbind(allResults,data.frame(RandomForestKfold=unlist(measures$means)))
allResults
A data.frame: 14 × 8
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_UncleanRandomForestRandomForestKfold
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+0412683.003.625200e+0420120.00
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03 541.001.932000e+03 1132.00
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03 787.001.887000e+03 927.00
TN3.378500e+0411296.003.750800e+0412593.003.609700e+0411963.003.785700e+0421113.00
F18.235592e+01 82.689.375320e+01 94.319.487718e+01 95.029.499627e+01 95.13
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01 94.889.509932e+01 95.24
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01 94.159.505231e+01 95.59
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01 95.679.514439e+01 94.91
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00 6.184.747886e+00 4.21
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01 95.919.494029e+01 94.67
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01 93.829.525211e+01 95.79
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01 0.909.019455e-01 0.90
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01 0.634.300000e-01 0.46
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01 0.999.897908e-01 0.99

Random Forest on Unclean dataset¶

In [104]:
OUTPUT_FIELD = "satisfactionv2"
source('utils.R')
selected_data<-NConvertClass(combined,OUTPUT_FIELD)

# Fill up missing values (Immputation)
missing_field<-c('Inflightservice','Onlinesupport','ArrivalDelayinMinutes')
selected_data[,missing_field]<-(randomForest::na.roughfix(selected_data[,missing_field]))

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(70/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Number of Training 181832"
[1] "Number of Testing 77928"
A data.frame: 6 × 24
satisfactionv2GenderCustomerTypeAgeTypeofTravelClassFlightDistanceSeatcomfortDepartureArrivaltimeconvenientFoodanddrink⋯EaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboardingDepartureDelayinMinutesArrivalDelayinMinutesInflightservice
<chr><chr><chr><int><chr><chr><int><int><int><int>⋯<int><int><int><int><int><int><int><int><dbl><dbl>
1694530FemaleLoyal Customer 66Personal TravelEco 429143⋯5355332 0 53
460871Femaledisloyal Customer24Business travelEco 2393222⋯525542515104
1124780Male Loyal Customer 45Business travelBusiness 397111⋯445444460414
1295160Male Loyal Customer 37Business travelBusiness2546111⋯555525536304
2074301FemaleLoyal Customer 27Business travelEco Plus 852252⋯521432261583
55181Male Loyal Customer 47Personal TravelEco 2478142⋯3335353 0 34
In [105]:
RFmodel_unclean<-RandomForestClassifier(training_data,testing_data,plot=TRUE,OUTPUT_FIELD)
result<-RFmodel_unclean$result
In [106]:
options(repr.plot.width = 10, repr.plot.height =8)
ConfusionMatrix_plot(result$gt,result$pred_labels)
accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- #BCE_loss(result$gt,result$proba)
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
            "Threshold :", threshold))
[1] "ACCURACY :  95.751206241659 , PRECISION:  95.4997383568812 , RECALL:  96.2214411247803 , AUC :  0.992602335979148 F1 score: 95.8592313752955 Loss : 0.53 Threshold : 0.53"
In [107]:
df <-unlist(result[-which(names(result) %in%c("pred_labels","gt","proba"))])
#allResults<-
allResults<-cbind(allResults,data.frame(RandomForest_unclean=unlist(df)))
allResults
A data.frame: 14 × 9
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_UncleanRandomForestRandomForestKfoldRandomForest_unclean
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+0412683.003.625200e+0420120.003.832500e+04
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03 541.001.932000e+03 1132.001.505000e+03
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03 787.001.887000e+03 927.001.806000e+03
TN3.378500e+0411296.003.750800e+0412593.003.609700e+0411963.003.785700e+0421113.003.629200e+04
F18.235592e+01 82.689.375320e+01 94.319.487718e+01 95.029.499627e+01 95.139.585923e+01
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01 94.889.509932e+01 95.249.575121e+01
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01 94.159.505231e+01 95.599.549974e+01
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01 95.679.514439e+01 94.919.601820e+01
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00 6.184.747886e+00 4.214.740406e+00
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01 95.919.494029e+01 94.679.622144e+01
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01 93.829.525211e+01 95.799.525959e+01
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01 0.909.019455e-01 0.909.149949e-01
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01 0.634.300000e-01 0.465.300000e-01
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01 0.999.897908e-01 0.999.926023e-01

K-FOLD Cross validation with unclen dataset on Randomforest¶

In [108]:
KFOLDS<- 6 # Number of folded 
dataset<-stratifiedDataset(processed_data)
measures<-runExperiment(dataset = dataset,FUN = RandomForestClassifier)
[1] "FOLD : , 1"
[1] "FOLD : , 2"
[1] "FOLD : , 3"
[1] "FOLD : , 4"
[1] "FOLD : , 5"
[1] "FOLD : , 6"
In [109]:
data.frame(measures$allresults)

data.frame(measures$means)
A data.frame: 6 × 14
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
201471106 9572108495.1294995.2349195.4653195.015774.34190894.7960395.658090.90467600.450.9905550
20126112710022103994.9765295.0824695.2574894.915644.54607394.6972295.453930.90162130.450.9906171
201261127 9742106795.0393195.1471395.3838994.922054.41903794.6972295.580960.90292060.450.9908081
201181134 9512109095.0733795.1839895.4862694.897414.31468694.6640395.685310.90366500.450.9901988
201661086 9612108095.1697895.2717595.4513295.100604.36005694.8898995.639940.90540880.450.9907016
201301122 9412109995.1255895.2346995.5341594.950724.26951094.7205095.730490.90467930.450.9908237
A data.frame: 1 × 14
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
2013511179642107695.0995.1995.4394.974.3894.7495.620.90.450.99
In [110]:
allResults<-cbind(allResults,data.frame(RandomForest_Kfold_unclean=unlist(measures$means)))
allResults
A data.frame: 14 × 10
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_UncleanRandomForestRandomForestKfoldRandomForest_uncleanRandomForest_Kfold_unclean
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+0412683.003.625200e+0420120.003.832500e+0420135.00
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03 541.001.932000e+03 1132.001.505000e+03 1117.00
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03 787.001.887000e+03 927.001.806000e+03 964.00
TN3.378500e+0411296.003.750800e+0412593.003.609700e+0411963.003.785700e+0421113.003.629200e+0421076.00
F18.235592e+01 82.689.375320e+01 94.319.487718e+01 95.029.499627e+01 95.139.585923e+01 95.09
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01 94.889.509932e+01 95.249.575121e+01 95.19
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01 94.159.505231e+01 95.599.549974e+01 95.43
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01 95.679.514439e+01 94.919.601820e+01 94.97
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00 6.184.747886e+00 4.214.740406e+00 4.38
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01 95.919.494029e+01 94.679.622144e+01 94.74
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01 93.829.525211e+01 95.799.525959e+01 95.62
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01 0.909.019455e-01 0.909.149949e-01 0.90
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01 0.634.300000e-01 0.465.300000e-01 0.45
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01 0.999.897908e-01 0.999.926023e-01 0.99
In [111]:
t(allResults)
A matrix: 10 × 14 of type dbl
TPFNFPTNF1accpgoodpbadFPRTPRTNRMCCthresholdAUC
LogisticRegression30902748657553378582.3559283.0086784.3004181.8613614.55488180.4991185.445120.66052910.510.9002825
Logistic_with_Kfold10345240519281129682.6800083.3100084.3000082.4500014.58000081.1300085.420000.67000000.510.9000000
DecisionTree35667272120323750893.7532093.9007894.6099493.23622 5.13910092.9118594.860900.87809450.420.9830880
DecisionTree_with_Kfold11940 811 6311259394.3100094.4500094.9800093.95000 4.77000093.6400095.230000.89000000.380.9900000
DecisionTree_unclean37754167124063609794.8771894.7682594.0089695.57562 6.24886495.7615793.751140.89548640.630.9869401
DecisionTree_Kfold_Unclean12683 541 7871196395.0200094.8800094.1500095.67000 6.18000095.9100093.820000.90000000.630.9900000
RandomForest36252193218873785794.9962795.0993295.0523195.14439 4.74788694.9402995.252110.90194550.430.9897908
RandomForestKfold201201132 9272111395.1300095.2400095.5900094.91000 4.21000094.6700095.790000.90000000.460.9900000
RandomForest_unclean38325150518063629295.8592395.7512195.4997496.01820 4.74040696.2214495.259590.91499490.530.9926023
RandomForest_Kfold_unclean201351117 9642107695.0900095.1900095.4300094.97000 4.38000094.7400095.620000.90000000.450.9900000
In [ ]:

In [ ]:

In [ ]:

Neural Network¶

In [112]:
library(tensorflow)
library(keras)
Attaching package: ‘tensorflow’


The following object is masked from ‘package:caret’:

    train


In [113]:
# Select field for NN
seed= 123
set.seed(seed)
OUTPUT_FIELD = "satisfactionv2"
source('utils.R')
select_field<-c('satisfactionv2',
               'Gender',
                'CustomerType',
                'TypeofTravel',
               
                'ClassEco',
                'ClassEco.Plus',
                'DepartureDelayinMinutes0',
                'DepartureDelayinMinutes1',
                'DepartureDelayinMinutes2',
         
                'ArrivalDelayinMinutes0',
                'ArrivalDelayinMinutes1',
                'ArrivalDelayinMinutes2',
                'ArrivalDelayinMinutes3',
                'Age',
                'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                'Foodanddrink',
                'Gatelocation',
                'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                 'Onboardservice',
                 'Legroomservice',
                 'Baggagehandling',
                 'Checkinservice',
                 'Cleanliness',
               'Onlineboarding'
               )

print("Select Fields")
print((select_field))


selected_data<-processed_data[,select_field]

selected_data<-selected_data[sample(nrow(selected_data)),]
training_records<-round(nrow(selected_data)*(85/100))
training_data <- selected_data[1:training_records,]
testing_data <- selected_data[-(1:training_records),]

print(paste("Number of Training",nrow(training_data)))
print(paste("Number of Testing",nrow(testing_data)))
head(training_data)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "DepartureDelayinMinutes0"       "DepartureDelayinMinutes1"      
 [9] "DepartureDelayinMinutes2"       "ArrivalDelayinMinutes0"        
[11] "ArrivalDelayinMinutes1"         "ArrivalDelayinMinutes2"        
[13] "ArrivalDelayinMinutes3"         "Age"                           
[15] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
[17] "Foodanddrink"                   "Gatelocation"                  
[19] "Inflightwifiservice"            "Inflightentertainment"         
[21] "EaseofOnlinebooking"            "Onboardservice"                
[23] "Legroomservice"                 "Baggagehandling"               
[25] "Checkinservice"                 "Cleanliness"                   
[27] "Onlineboarding"                
[1] "Number of Training 220796"
[1] "Number of Testing 38964"
A data.frame: 6 × 27
satisfactionv2GenderCustomerTypeTypeofTravelClassEcoClassEco.PlusDepartureDelayinMinutes0DepartureDelayinMinutes1DepartureDelayinMinutes2ArrivalDelayinMinutes0⋯GatelocationInflightwifiserviceInflightentertainmentEaseofOnlinebookingOnboardserviceLegroomserviceBaggagehandlingCheckinserviceCleanlinessOnlineboarding
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>⋯<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
1827350111100100⋯0.40.60.80.60.40.80.40.60.80.6
1889420011001001⋯0.40.81.00.80.60.60.80.61.00.8
1340580000100001⋯0.40.20.60.20.60.80.80.80.80.8
1240221101001001⋯1.01.01.01.01.00.81.00.40.21.0
1609970000100100⋯0.80.61.00.60.80.80.60.81.00.6
2263181101001000⋯0.20.20.80.20.80.80.80.80.80.8
In [114]:
# ************************************************
# N_MLP_TrainClassifier()
#
# MLP NEURAL NETWORK
#
# INPUT:  Frame      - train              - scaled [0.0,1.0], fields & rows
#         String     - fieldNameOutput    - Name of the field to classify
#         Int Vector - hidden             - Number of hidden layer neurons for each layer
#         boolean    - plot               - TRUE = output charts/results
#
# OUTPUT: object     - trained neural network
# ************************************************
N_MLP_TrainClassifier<- function(train,
                                  fieldNameOutput,
                               
                                  plot
                                  ){

  positionClassOutput<-which(names(train)==fieldNameOutput)

  # train data: dataframe with the input fields
  train_inputs<-train[-positionClassOutput]

  # train data: vector with the expedcted output
  train_expected<-train[,positionClassOutput]

  x<-as.matrix(train_inputs)
  y<-as.matrix(train_expected)#keras::to_categorical(train_expected,num_classes = 2)

  mlp_classifier = keras_model_sequential()

  # add layers, first layer needs input dimension
  mlp_classifier %>%
    keras::layer_dense(input_shape = ncol(x), units=ncol(x), activation = "relu") %>%
    keras::layer_dropout(0.2) %>%
    keras::layer_dense(units = 32, activation = "relu") %>%
    keras::layer_dropout(0.2) %>%
    keras::layer_dense(units = 16, activation = "relu") %>%
    keras::layer_dropout(0.2) %>%
    
    keras::layer_dense(units = 1, activation = "sigmoid")

  # add a loss function and optimizer
  mlp_classifier %>%
    keras::compile(
      loss = "binary_crossentropy",
      optimizer = "adam",
      metrics = "accuracy"
    )

  # train model with our training data set
  fit = mlp_classifier %>%
    keras::fit(
      x = x,
      y = y,
      shuffle = T,
      batch_size = 32,
      validation_split = 0.1765,
      epochs = 50,
      callbacks = c(
        callback_early_stopping(monitor = "val_loss", patience = 8, mode = "auto"),
      callback_progbar_logger()),
      verbose=0, view_metrics=0,
    )

  # Plot the neural network error (loss) udring training
  if (plot==TRUE)
    print(plot(fit))

  return(mlp_classifier)
}

# ************************************************
# N_EVALUATE_MLP() :
#
# Evaluate MLP Neural Network classifier
# Generates probabilities from the classifier
#
# INPUT: Data Frame    -  testing_data     - scaled [0.0,1.0], fields & rows
#        String        -  fieldNameOutput  - Name of the field that we are training on (i.e.Status)
#        Object        - mlp_classifier    - trained NN including the learn weights, etc.
#         boolean      - plot              - TRUE = output charts/results
#         string       - myTitle           - title on results
#
# OUTPUT :
#         list - metrics from confusion matrix
# ************************************************

N_evaluate_MLP<-function(test,fieldNameOutput,mlp_classifier,plot,myTitle){

  positionClassOutput<-which(names(test)==fieldNameOutput)

  # test data: dataframe with with just input fields
  test_inputs<-test[-positionClassOutput]

  # Generate class membership probabilities
  # Column 1 is for class 0 (bad loan) and column 2 is for class 1 (good loan)

  testPredictedAllClassProbs<-predict(mlp_classifier,as.matrix(test_inputs))
  
   # print(testPredictedAllClassProbs)
  # Probabilities for just class 1
  testPredictedClassProbs<-testPredictedAllClassProbs #[,2]

  # train data: vector with the expedcted output
  test_expected<-test[,positionClassOutput]

  measures<-NdetermineThreshold(test_expected=test_expected,
                                test_predicted=testPredictedClassProbs,
                                plot=plot,
                                title=myTitle)
 
  return(measures)
}
# ************************************************
# mlpNeural() : causing a kernel death 
#
# SHALLOW BASIC MLP TRAINING
# Uses Keras or h2o library to create a basic 3 layer MLP
#
# INPUT   :
#         :   Data Frame     - rawDataset  - original dataset
#             boolean        - plot        - TRUE = output charts/results
#
# OUTPUT  :
#         :   Data Frame     - measures  - performance metrics
#
# 311019NRTY Updated to use either h2o or Keras library
# ************************************************
mlpNeural<-function(train,test, plot=TRUE){

  myTitle<-paste("Neural Network")
  print(myTitle)

  # Set to TRUE to use the h2o library
  # otherwise FALSE to try to use the Keras library

  if (FALSE) {
    N_DEEP_Initialise()

    mlp_classifier<-N_DEEP_TrainClassifier(train=train,
                                           fieldNameOutput=OUTPUT_FIELD,
                                           hidden=BASICNN_HIDDEN,
                                           stopping_rounds=DEEP_STOPPING,
                                           stopping_tolerance=DEEP_TOLERANCE,
                                           activation=DEEP_ACTIVATION,
                                           reproducible=DEEP_REPRODUCABLE)

    plot(mlp_classifier,metric="classification_error")

    # Evaluate the deep NN as we have done previously
    measures<-N_EVALUATE_DeepNeural(test=test,
                                    fieldNameOutput=OUTPUT_FIELD,
                                    deep=mlp_classifier,
                                    plot=plot,
                                    myTitle = myTitle)
  } else {

    mlp_classifier<-N_MLP_TrainClassifier(train=train,
                                          fieldNameOutput=OUTPUT_FIELD,
                                        
                                          plot=plot)

    measures<-N_evaluate_MLP(test=test,
                             fieldNameOutput=OUTPUT_FIELD,
                             mlp_classifier=mlp_classifier,
                             plot=plot,
                             myTitle=myTitle)
  } #endof if()
    
  return(list("model"=mlp_classifier,"result"=measures))
} #endof mlpNeural()
In [115]:
options(repr.plot.width = 10, repr.plot.height =8)

model<-mlpNeural(training_data,testing_data,plot=TRUE)
[1] "Neural Network"
Loaded Tensorflow version 2.9.3

In [116]:
result<-model$result
In [117]:
options(repr.plot.width = 10, repr.plot.height =8)
ConfusionMatrix_plot(result$gt,result$pred_labels)

accuracy<-result$acc
precision<-result$pgood
recall<-result$TPR
auc <- result$AUC
f1_score <- F1_score(precision,recall)
loss <- BCE_loss(result$gt,result$proba)
threshold<- result$threshold
print(paste("ACCURACY : ", accuracy,
            ", PRECISION: ", precision,
            ", RECALL: ", recall, 
            ", AUC : ",auc,
            "F1 score:", f1_score,
            "Loss :", loss,
            "Threshold :", threshold))
[1] "ACCURACY :  92.0824350682681 , PRECISION:  92.490678989655 , RECALL:  91.410629022213 , AUC :  0.975746762350948 F1 score: 91.9474824462948 Loss : 0.19880978781506 Threshold : 0.42"
In [118]:
df <-unlist(result[-which(names(result) %in%c("pred_labels","gt","proba"))])
#allResults<-
allResults<-cbind(allResults,data.frame(NeuralNetwork=unlist(df)))
allResults
A data.frame: 14 × 11
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_UncleanRandomForestRandomForestKfoldRandomForest_uncleanRandomForest_Kfold_uncleanNeuralNetwork
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
TP3.090200e+0410345.003.566700e+0411940.003.775400e+0412683.003.625200e+0420120.003.832500e+0420135.001.761300e+04
FN7.486000e+03 2405.002.721000e+03 811.001.671000e+03 541.001.932000e+03 1132.001.505000e+03 1117.001.655000e+03
FP5.755000e+03 1928.002.032000e+03 631.002.406000e+03 787.001.887000e+03 927.001.806000e+03 964.001.430000e+03
TN3.378500e+0411296.003.750800e+0412593.003.609700e+0411963.003.785700e+0421113.003.629200e+0421076.001.826600e+04
F18.235592e+01 82.689.375320e+01 94.319.487718e+01 95.029.499627e+01 95.139.585923e+01 95.099.194748e+01
acc8.300867e+01 83.319.390078e+01 94.459.476825e+01 94.889.509932e+01 95.249.575121e+01 95.199.208244e+01
pgood8.430041e+01 84.309.460994e+01 94.989.400896e+01 94.159.505231e+01 95.599.549974e+01 95.439.249068e+01
pbad8.186136e+01 82.459.323622e+01 93.959.557562e+01 95.679.514439e+01 94.919.601820e+01 94.979.169218e+01
FPR1.455488e+01 14.585.139100e+00 4.776.248864e+00 6.184.747886e+00 4.214.740406e+00 4.387.260357e+00
TPR8.049911e+01 81.139.291185e+01 93.649.576157e+01 95.919.494029e+01 94.679.622144e+01 94.749.141063e+01
TNR8.544512e+01 85.429.486090e+01 95.239.375114e+01 93.829.525211e+01 95.799.525959e+01 95.629.273964e+01
MCC6.605291e-01 0.678.780945e-01 0.898.954864e-01 0.909.019455e-01 0.909.149949e-01 0.908.416657e-01
threshold5.100000e-01 0.514.200000e-01 0.386.300000e-01 0.634.300000e-01 0.465.300000e-01 0.454.200000e-01
AUC9.002825e-01 0.909.830880e-01 0.999.869401e-01 0.999.897908e-01 0.999.926023e-01 0.999.757468e-01

Visualize a performance of each model¶

The performance metrics use in evaluating between multiple model are

  • Accuracy
  • Area Under the Curve
  • F1 Score
In [119]:
# select only the required metrics
rplot<-allResults[c("acc","F1","AUC"),]
# normalize accuracy and f1 by 100 (so that they is in same scale as AUC for easier visualization)
rplot[c("acc","F1"),]<-rplot[c("acc","F1"),]/100
rplot
A data.frame: 3 × 11
LogisticRegressionLogistic_with_KfoldDecisionTreeDecisionTree_with_KfoldDecisionTree_uncleanDecisionTree_Kfold_UncleanRandomForestRandomForestKfoldRandomForest_uncleanRandomForest_Kfold_uncleanNeuralNetwork
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
acc0.83008670.83310.93900780.94450.94768250.94880.95099320.95240.95751210.95190.9208244
F10.82355920.82680.93753200.94310.94877180.95020.94996270.95130.95859230.95090.9194748
AUC0.90028250.90000.98308800.99000.98694010.99000.98979080.99000.99260230.99000.9757468
In [ ]:

In [120]:
idx_kfold<-grepl("Kfold",names(rplot))
kfold_plot<-rplot[,idx_kfold]
normal_plot<-rplot[,!idx_kfold]
In [121]:
# stack a dataframe and combine with the fields of metrics name for each rows
kfold_plot<-cbind(stack(kfold_plot),data.frame(metrics=row.names(kfold_plot)))
normal_plot<-cbind(stack(normal_plot), data.frame(metrics=row.names(normal_plot)))
head(kfold_plot)
head(normal_plot)
A data.frame: 6 × 3
valuesindmetrics
<dbl><fct><chr>
10.8331Logistic_with_Kfold acc
20.8268Logistic_with_Kfold F1
30.9000Logistic_with_Kfold AUC
40.9445DecisionTree_with_Kfoldacc
50.9431DecisionTree_with_KfoldF1
60.9900DecisionTree_with_KfoldAUC
A data.frame: 6 × 3
valuesindmetrics
<dbl><fct><chr>
10.8300867LogisticRegressionacc
20.8235592LogisticRegressionF1
30.9002825LogisticRegressionAUC
40.9390078DecisionTree acc
50.9375320DecisionTree F1
60.9830880DecisionTree AUC
In [122]:
options(repr.plot.width = 20, repr.plot.height =5)
# Grouped
ggplot(normal_plot, aes(fill=metrics, y=values, x=ind)) + 
geom_bar(position="dodge", stat="identity",colour="black")+
scale_fill_discrete(labels=c("Accuracy","AUC","F1 Score"))+
labs(x= "Models", title="Results of each model on Airline Passenger Dataset")  +
theme(plot.title = element_text(face="bold",hjust=0.5, size =20),
      axis.title.x = element_text(face="bold", color="black", size=12),
      axis.title.y = element_text(face="bold", color="black", size=12),
      axis.text.x = element_text( face="bold", color='black', size=12),
      legend.title = element_text(face="bold", color="black", size=12),
      legend.text = element_text(size = 14),
      legend.key.size= unit(1.0,"cm"),
      legend.key.width = unit(1,"cm"),
      legend.box.spacing = unit(0.4,"cm")
      )
In [123]:
options(repr.plot.width = 20, repr.plot.height =5)
# Grouped
ggplot(kfold_plot, aes(fill=metrics, y=values, x=ind)) + 
geom_bar(position="dodge", stat="identity",colour="black")+
scale_fill_discrete(labels=c("Accuracy","AUC","F1 Score"))+
labs(x= "Models", title="Results of each model on Airline Passenger Dataset using K-fold cross validation")  +
theme(plot.title = element_text(face="bold",hjust=0.5, size =20),
      axis.title.x = element_text(face="bold", color="black", size=12),
      axis.title.y = element_text(face="bold", color="black", size=12),
      legend.title = element_text(face="bold", color="black", size=12),
            axis.text.x = element_text( face="bold", color='black', size=12),
      legend.text = element_text(size = 14),
      legend.key.size= unit(1.0,"cm"),
      legend.key.width = unit(1,"cm"),
      legend.box.spacing = unit(0.4,"cm")
      )

K means Clustering¶

In [129]:
OUTPUT_FIELD<-'satisfactionv2'

options(repr.plot.width = 10, repr.plot.height =8)
seed= 12
set.seed(seed)
select_field<-c('satisfactionv2',
              'Gender',
               'CustomerType',
              'TypeofTravel',

                 'ClassEco',
                 'ClassEco.Plus',
                'DepartureDelayinMinutes0',
               'DepartureDelayinMinutes1',
                'DepartureDelayinMinutes2',
   
                'ArrivalDelayinMinutes0',
                'ArrivalDelayinMinutes1',
               'ArrivalDelayinMinutes2',
                'ArrivalDelayinMinutes3',
               'Age',
               'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                 'Foodanddrink',
                'Gatelocation',
               'Inflightwifiservice',
                'Inflightentertainment',
               'EaseofOnlinebooking',
                'Onboardservice',
                'Legroomservice',
                'Baggagehandling',
                'Checkinservice',
               'Cleanliness',
             'Onlineboarding'
               )

print("Select Fields")
print((select_field))


selected_data<-processed_data[,select_field]

predictors<-selected_data[,-1]
names(predictors)
[1] "Select Fields"
 [1] "satisfactionv2"                 "Gender"                        
 [3] "CustomerType"                   "TypeofTravel"                  
 [5] "ClassEco"                       "ClassEco.Plus"                 
 [7] "DepartureDelayinMinutes0"       "DepartureDelayinMinutes1"      
 [9] "DepartureDelayinMinutes2"       "ArrivalDelayinMinutes0"        
[11] "ArrivalDelayinMinutes1"         "ArrivalDelayinMinutes2"        
[13] "ArrivalDelayinMinutes3"         "Age"                           
[15] "Seatcomfort"                    "DepartureArrivaltimeconvenient"
[17] "Foodanddrink"                   "Gatelocation"                  
[19] "Inflightwifiservice"            "Inflightentertainment"         
[21] "EaseofOnlinebooking"            "Onboardservice"                
[23] "Legroomservice"                 "Baggagehandling"               
[25] "Checkinservice"                 "Cleanliness"                   
[27] "Onlineboarding"                
  1. 'Gender'
  2. 'CustomerType'
  3. 'TypeofTravel'
  4. 'ClassEco'
  5. 'ClassEco.Plus'
  6. 'DepartureDelayinMinutes0'
  7. 'DepartureDelayinMinutes1'
  8. 'DepartureDelayinMinutes2'
  9. 'ArrivalDelayinMinutes0'
  10. 'ArrivalDelayinMinutes1'
  11. 'ArrivalDelayinMinutes2'
  12. 'ArrivalDelayinMinutes3'
  13. 'Age'
  14. 'Seatcomfort'
  15. 'DepartureArrivaltimeconvenient'
  16. 'Foodanddrink'
  17. 'Gatelocation'
  18. 'Inflightwifiservice'
  19. 'Inflightentertainment'
  20. 'EaseofOnlinebooking'
  21. 'Onboardservice'
  22. 'Legroomservice'
  23. 'Baggagehandling'
  24. 'Checkinservice'
  25. 'Cleanliness'
  26. 'Onlineboarding'
In [130]:
seed= 12
set.seed(seed)
for( n in 2:10){
    modelKmeans <- kmeans(x=predictors, centers=n, nstart=10)
    p<-factoextra::fviz_cluster(modelKmeans, data = predictors,geom = "point")
    print(p)
    }
Warning message:
“Quick-TRANSfer stage steps exceeded maximum (= 12988000)”

Dimensionality Reduction¶

In [131]:
OUTPUT_FIELD<-'satisfactionv2'


seed= 12
set.seed(seed)
select_field<-c('satisfactionv2',
#              'Gender',
               'CustomerType',
#               'TypeofTravel',

                 'ClassEco',
                 'ClassEco.Plus',
#                 'DepartureDelayinMinutes0',
#                'DepartureDelayinMinutes1',
#                 'DepartureDelayinMinutes2',
   
#                 'ArrivalDelayinMinutes0',
#                 'ArrivalDelayinMinutes1',
#                'ArrivalDelayinMinutes2',
#                 'ArrivalDelayinMinutes3',
#                'Age',
 #               'Seatcomfort',
                'DepartureArrivaltimeconvenient',
                 'Foodanddrink',
#                 'Gatelocation',
#                'Inflightwifiservice',
 #                'Inflightentertainment'#,
               'EaseofOnlinebooking'#,
 #                'Onboardservice',
#                 'Legroomservice'
#                 'Baggagehandling'#,
 #                'Checkinservice',
 #               'Cleanliness',
#              'Onlineboarding'
               )

print("Select Fields")
print((select_field))


selected_data<-processed_data[,select_field]

predictors<-selected_data[,-1]
names(predictors)
[1] "Select Fields"
[1] "satisfactionv2"                 "CustomerType"                  
[3] "ClassEco"                       "ClassEco.Plus"                 
[5] "DepartureArrivaltimeconvenient" "Foodanddrink"                  
[7] "EaseofOnlinebooking"           
  1. 'CustomerType'
  2. 'ClassEco'
  3. 'ClassEco.Plus'
  4. 'DepartureArrivaltimeconvenient'
  5. 'Foodanddrink'
  6. 'EaseofOnlinebooking'
In [132]:
seed= 12
set.seed(seed)
for( n in 2:10){
    modelKmeans <- kmeans(x=predictors, centers=n, nstart=10)
    p<-factoextra::fviz_cluster(modelKmeans, data = predictors,geom = "point")
    print(p)
    }
In [ ]:

In [ ]:

In [135]:
seed= 12
set.seed(seed)
modelKmeans <- kmeans(x=predictors, centers=5, nstart=25)
p<-factoextra::fviz_cluster(modelKmeans, data = predictors, geom='point')
print(p)
In [136]:
print(str(modelKmeans))
List of 9
 $ cluster     : int [1:259760] 2 3 2 2 2 2 2 2 3 2 ...
 $ centers     : num [1:5, 1:6] 1 0 0 1 0 1 1 0 0 0 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : chr [1:5] "1" "2" "3" "4" ...
  .. ..$ : chr [1:6] "CustomerType" "ClassEco" "ClassEco.Plus" "DepartureArrivaltimeconvenient" ...
 $ totss       : num 185544
 $ withinss    : num [1:5] 6531 21397 25673 7040 4183
 $ tot.withinss: num 64824
 $ betweenss   : num 120720
 $ size        : int [1:5] 27268 89350 105858 20292 16992
 $ iter        : int 2
 $ ifault      : int 0
 - attr(*, "class")= chr "kmeans"
NULL
In [137]:
center<-modelKmeans$centers
center
A matrix: 5 × 6 of type dbl
CustomerTypeClassEcoClassEco.PlusDepartureArrivaltimeconvenientFoodanddrinkEaseofOnlinebooking
1110.000000000.46033450.56235150.5631143
2010.000000000.67701620.59367660.5992860
3000.000000000.59364810.63088290.6670937
4100.090183320.50195150.59154350.5939681
5001.000000000.64956450.59815210.6022128
In [ ]:

K Nearest Neighbour (KNN)¶

*This code for knn should be run on R-studio*

The dataset used for KNN is the same for Logistic regression and other models but in this case the processed data is save in processed.csv file

In [ ]:
#  clears all objects in "global environment"
rm(list=ls())
# Automatically release memory
gc()

# Clear plots and other graphics in RStudio
if(!is.null(dev.list())) dev.off()
graphics.off()

# Clears the RStudio console area
cat("\014")

set.seed(123)

#load the pre-processed data set into a data frame
dataset <- NreadDataset("processed.csv")
head(dataset)

#removes the X column that is not relevant to the task
dataset <- subset(dataset, select = -c(X))

MYLIBRARIES<-c(
  "KODAMA",
  "DMwR2")
# ************************************************
# NplotConfusion()
#
# Plot confusion matrix
#
# INPUT:    list - results - results from NcalcConfusion()
#
# OUTPUT :  NONE
#
# 070819NRT Plots confusion matrix
# ************************************************
NplotConfusion<-function(results){
  
  aa<-matrix(c(round(results$TP,digits=0),
               round(results$FN,digits=0),
               round(results$FP,digits=0),
               round(results$TN,digits=0)),
             nrow=2)
  row.names(aa)<-c("Fraud","Genuine")
  colnames(aa)<-c("Fraud","Genuine")
  fourfoldplot(aa,color=c("#cc6666","#99cc99"),
               conf.level=0,
               margin=2,
               main="TP  FP / FN   TN")
} #endof NplotConfusion()

# ************************************************
# NcalcMeasures() :
#
# Evaluation measures for a confusion matrix
#
# INPUT: numeric  - TP, FN, FP, TN
#
# OUTPUT: A list with the following entries:
#        TP        - double - True Positive records
#        FP        - double - False Positive records
#        TN        - double - True Negative records
#        FN        - double - False Negative records
#        accuracy  - double - accuracy measure
#        pgood     - double - precision for "good" (values are 1) measure
#        pbad      - double - precision for "bad" (values are 1) measure
#        FPR       - double - FPR measure
#        TPR       - double - FPR measure
#        TNR       - double - TNR measure
#        MCC       - double - Matthew's Correlation Coeficient
#
# 080819NRT added TNR measure
# ************************************************
NcalcMeasures<-function(TP,FN,FP,TN){
  
  retList<-list(  "TP"=TP,
                  "FN"=FN,
                  "TN"=TN,
                  "FP"=FP,
                  "accuracy"=100.0*((TP+TN)/(TP+FP+FN+TN)),
                  "pgood"=   100.0*(TP/(TP+FP)),
                  "pbad"=    100.0*(TN/(FN+TN)),
                  "FPR"=     100.0*(FP/(FP+TN)),
                  "TPR"=     100.0*(TP/(TP+FN)),
                  "TNR"=     100.0*(TN/(FP+TN)),
                  "MCC"=     ((TP*TN)-(FP*FN))/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
  )
  return(retList)
}

# ************************************************
# NcalcConfusion() :
#
# Calculate a confusion matrix for 2-class classifier
# INPUT: vector - expectedClass  - {0,1}, Expected outcome from each row (labels)
#        vector - predictedClass - {0,1}, Predicted outcome from each row (labels)
#
# OUTPUT: A list with the  entries from NcalcMeasures()
#
# 070819NRT convert values to doubles to avoid integers overflowing
# Updated to the following definition of the confusion matrix
#
# A good loan is indicated when $Status=1 and bad when $Status=0

#                    ACTUAL
#               ------------------
# PREDICTED     GOOD=1   |  BAD=0
#               ------------------
#     GOOD=1      TP     |    FP
#               ==================
#     BAD=0       FN     |    TN
#
#
# ************************************************
NcalcConfusion<-function(expectedClass,predictedClass){
  
  confusion<-table(factor(predictedClass,levels=0:1),factor(expectedClass,levels=0:1))
  
  # This "converts" the above into our preferred format
  
  TP<-as.double(confusion[2,2])
  FN<-as.double(confusion[1,2])
  FP<-as.double(confusion[2,1])
  TN<-as.double(confusion[1,1])
  
  return(NcalcMeasures(TP,FN,FP,TN))
  
} #endof NcalcConfusion()

# ************************************************
# NPREPROCESSING_removePunctuation()
#
# INPUT: String - fieldName - name of field
#
# OUTPUT : String - name of field with punctuation removed
# ************************************************
NPREPROCESSING_removePunctuation<-function(fieldName){
  return(gsub("[[:punct:][:blank:]]+", "", fieldName))
}

# ************************************************
# NreadDataset() :
#
# Read a CSV file from working directory
#
# INPUT: string - csvFilename - CSV filename
#
# OUTPUT : data frame - contents of the headed CSV file
# ************************************************
NreadDataset<-function(csvFilename){
  
  dataset<-read.csv(csvFilename,encoding="UTF-8",stringsAsFactors = FALSE)
  
  # The field names "confuse" some of the library algorithms
  # As they do not like spaces, punctuation, etc.
  names(dataset)<-NPREPROCESSING_removePunctuation(names(dataset))
  
  print(paste("CSV dataset",csvFilename,"has been read. Records=",nrow(dataset)))
  return(dataset)
}




##################################################################################################################




#loads the kodama library to preform KNN algorithm
#library(KODAMA)

library(pacman)
pacman::p_load(char=MYLIBRARIES,install=TRUE,character.only=TRUE)

# split the dataset in a 70/30 ratio

idxs <- sample(1:nrow(dataset), as.integer(0.7*nrow(dataset)))
train_data <- dataset[idxs,]

test_data <- dataset[-idxs,]

#x_train<- subset(train_data, select= -c(satisfactionv2))
#y_train<- train_data$satisfactionv2
#ap <- available.packages()

#nn3 <- kNN(satisfactionv2~.,train_data,test_data, stand=FALSE, k=3)
#confusion_matrix <- table(test_data[,"satisfactionv2"], nn3)

# Seperate predictors and predicted field for each set 
x_train<- subset(train_data, select= -c(satisfactionv2))
y_train<- train_data$satisfactionv2

x_test <- subset(test_data, select= -c(satisfactionv2))
y_test <- test_data$satisfactionv2


#Utilising the DWR2  library to classify the data

nn3 <- kNN(satisfactionv2~.,train_data,test_data, stand=FALSE, k=3)

confusion_matrix <- table(test_data[,"satisfactionv2"], nn3)

matrix<- as.data.frame.matrix(confusion_matrix)

matrix_metrics <- NcalcMeasures(matrix$`0`[1], matrix$`1`[2],matrix$`0`[2],matrix$`1`[1])

NplotConfusion(matrix_metrics)


# perform knn with k =3 using the kodama library
kodama_knn <- knn.kodama(x_train, y_train, x_test, k=3)
#results <- table(kodama_knn$Ypred[,3], y_test)

#determine the metrics to see the effectiveness of the models
metrics<-NcalcConfusion(y_test, kodama_knn$Ypred[,1])

NplotConfusion(metrics)